From f62f198b2a6f2fe7e17d5599c7665da5934cdd60 Mon Sep 17 00:00:00 2001 From: verku Date: Fri, 15 Sep 2023 15:48:38 +0200 Subject: [PATCH 01/49] Update documentation --- .test/config/config_mitogenomes.yaml | 2 +- .test/config/config_mlRho_options.yaml | 2 +- .test/config/config_pca_roh.yaml | 2 +- .test/config/config_snpeff_gerp.yaml | 2 +- config/config.yaml | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.test/config/config_mitogenomes.yaml b/.test/config/config_mitogenomes.yaml index 5019bd3..50acf44 100644 --- a/.test/config/config_mitogenomes.yaml +++ b/.test/config/config_mitogenomes.yaml @@ -494,7 +494,7 @@ gerp_ref_path: "" # Full path to phylogenetic tree of all species included in the analysis # (including the target species) in NEWICK format and including divergence # time estimates. -# Divergence time estimates must be in billions of years for correct scaling +# Divergence time estimates must be in millions of years for correct scaling # of GERP scores (see dated phylogenetic trees from www.timetree.org). # Species names in the tree must be identical to the FASTA file names # without ".fa.gz", ".fasta.gz" or ".fna.gz". diff --git a/.test/config/config_mlRho_options.yaml b/.test/config/config_mlRho_options.yaml index 6876861..9887417 100644 --- a/.test/config/config_mlRho_options.yaml +++ b/.test/config/config_mlRho_options.yaml @@ -494,7 +494,7 @@ gerp_ref_path: "" # Full path to phylogenetic tree of all species included in the analysis # (including the target species) in NEWICK format and including divergence # time estimates. -# Divergence time estimates must be in billions of years for correct scaling +# Divergence time estimates must be in millions of years for correct scaling # of GERP scores (see dated phylogenetic trees from www.timetree.org). # Species names in the tree must be identical to the FASTA file names # without ".fa.gz", ".fasta.gz" or ".fna.gz". 
diff --git a/.test/config/config_pca_roh.yaml b/.test/config/config_pca_roh.yaml index 40e2097..ef0a968 100644 --- a/.test/config/config_pca_roh.yaml +++ b/.test/config/config_pca_roh.yaml @@ -494,7 +494,7 @@ gerp_ref_path: "" # Full path to phylogenetic tree of all species included in the analysis # (including the target species) in NEWICK format and including divergence # time estimates. -# Divergence time estimates must be in billions of years for correct scaling +# Divergence time estimates must be in millions of years for correct scaling # of GERP scores (see dated phylogenetic trees from www.timetree.org). # Species names in the tree must be identical to the FASTA file names # without ".fa.gz", ".fasta.gz" or ".fna.gz". diff --git a/.test/config/config_snpeff_gerp.yaml b/.test/config/config_snpeff_gerp.yaml index 0c8de76..a2ecb25 100644 --- a/.test/config/config_snpeff_gerp.yaml +++ b/.test/config/config_snpeff_gerp.yaml @@ -494,7 +494,7 @@ gerp_ref_path: ".test/data/gerp_data" # Full path to phylogenetic tree of all species included in the analysis # (including the target species) in NEWICK format and including divergence # time estimates. -# Divergence time estimates must be in billions of years for correct scaling +# Divergence time estimates must be in millions of years for correct scaling # of GERP scores (see dated phylogenetic trees from www.timetree.org). # Species names in the tree must be identical to the FASTA file names # without ".fa.gz", ".fasta.gz" or ".fna.gz". diff --git a/config/config.yaml b/config/config.yaml index ee1c0b1..5124efd 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -496,7 +496,7 @@ gerp_ref_path: "" # Full path to phylogenetic tree of all species included in the analysis # (including the target species) in NEWICK format and including divergence # time estimates. 
-# Divergence time estimates must be in billions of years for correct scaling +# Divergence time estimates must be in millions of years for correct scaling # of GERP scores (see dated phylogenetic trees from www.timetree.org). # Species names in the tree must be identical to the FASTA file names # without ".fa.gz", ".fasta.gz" or ".fna.gz". From d8c9cd8d459b338db664c127ef64d4b82fd58427 Mon Sep 17 00:00:00 2001 From: verku Date: Fri, 15 Sep 2023 16:04:27 +0200 Subject: [PATCH 02/49] Replace rescale_gerp rule with -s parameter for gerpcol --- .test/config/config_mitogenomes.yaml | 4 +++ .test/config/config_mlRho_options.yaml | 4 +++ .test/config/config_pca_roh.yaml | 4 +++ .test/config/config_snpeff_gerp.yaml | 4 +++ config/config.yaml | 4 +++ workflow/rules/13_GERP.smk | 37 +++++--------------------- 6 files changed, 26 insertions(+), 31 deletions(-) diff --git a/.test/config/config_mitogenomes.yaml b/.test/config/config_mitogenomes.yaml index 50acf44..be5989b 100644 --- a/.test/config/config_mitogenomes.yaml +++ b/.test/config/config_mitogenomes.yaml @@ -500,6 +500,10 @@ gerp_ref_path: "" # without ".fa.gz", ".fasta.gz" or ".fna.gz". tree: "" +# Tree scaling factor for GERP++ ("-s") to re-scale tree. If a tree from +# www.timetree.org is provided (in millions of years), set to 0.001. +tree_scaling_factor: 0.001 + # Minimum and maximum GERP score for a site to be included into calculations # of relative mutational load. # Positive values indicate purifying selection. diff --git a/.test/config/config_mlRho_options.yaml b/.test/config/config_mlRho_options.yaml index 9887417..774f94b 100644 --- a/.test/config/config_mlRho_options.yaml +++ b/.test/config/config_mlRho_options.yaml @@ -500,6 +500,10 @@ gerp_ref_path: "" # without ".fa.gz", ".fasta.gz" or ".fna.gz". tree: "" +# Tree scaling factor for GERP++ ("-s") to re-scale tree. If a tree from +# www.timetree.org is provided (in millions of years), set to 0.001. 
+tree_scaling_factor: 0.001 + # Minimum and maximum GERP score for a site to be included into calculations # of relative mutational load. # Positive values indicate purifying selection. diff --git a/.test/config/config_pca_roh.yaml b/.test/config/config_pca_roh.yaml index ef0a968..08f6c6e 100644 --- a/.test/config/config_pca_roh.yaml +++ b/.test/config/config_pca_roh.yaml @@ -500,6 +500,10 @@ gerp_ref_path: "" # without ".fa.gz", ".fasta.gz" or ".fna.gz". tree: "" +# Tree scaling factor for GERP++ ("-s") to re-scale tree. If a tree from +# www.timetree.org is provided (in millions of years), set to 0.001. +tree_scaling_factor: 0.001 + # Minimum and maximum GERP score for a site to be included into calculations # of relative mutational load. # Positive values indicate purifying selection. diff --git a/.test/config/config_snpeff_gerp.yaml b/.test/config/config_snpeff_gerp.yaml index a2ecb25..a8db6e3 100644 --- a/.test/config/config_snpeff_gerp.yaml +++ b/.test/config/config_snpeff_gerp.yaml @@ -506,6 +506,10 @@ tree: ".test/data/gerp_data/gerp_tree.nwk" min_gerp: 0 max_gerp: 1000 +# Tree scaling factor for GERP++ ("-s") to re-scale tree. If a tree from +# www.timetree.org is provided (in millions of years), set to 0.001. +tree_scaling_factor: 0.001 + ##### # NOTE: # The GERP step produces a large number of large intermediate files, diff --git a/config/config.yaml b/config/config.yaml index 5124efd..c593eaa1 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -502,6 +502,10 @@ gerp_ref_path: "" # without ".fa.gz", ".fasta.gz" or ".fna.gz". tree: "" +# Tree scaling factor for GERP++ ("-s") to re-scale tree. If a tree from +# www.timetree.org is provided (in millions of years), set to 0.001. +tree_scaling_factor: 0.001 + # Minimum and maximum GERP score for a site to be included into calculations # of relative mutational load. # Positive values indicate purifying selection. 
diff --git a/workflow/rules/13_GERP.smk b/workflow/rules/13_GERP.smk index cdf9c17..868990e 100644 --- a/workflow/rules/13_GERP.smk +++ b/workflow/rules/13_GERP.smk @@ -596,6 +596,7 @@ rule compute_gerp: gerp_dir=temp(directory("results/gerp/chunks/" + REF_NAME + "/gerp/{chunk}_gerp_raw/")), params: name=REF_NAME, + tree_scale=config["tree_scaling_factor"], log: "results/logs/13_GERP/chunks/" + REF_NAME + "/gerp/{chunk}_compute_gerp.log", threads: 4 @@ -608,7 +609,7 @@ rule compute_gerp: fi for contig in $(awk -F'\t' '{{print $1}}' {input.chunk_bed}) # run the analysis per contig do - gerpcol -v -f {input.concatenated_fasta_dir}/${{contig}}.fasta -t {input.tree} -a -e {params.name} 2> {log} && + gerpcol -v -f {input.concatenated_fasta_dir}/${{contig}}.fasta -t {input.tree} -a -s {params.tree_scale} -e {params.name} 2> {log} && mv {input.concatenated_fasta_dir}/${{contig}}.fasta.rates {output.gerp_dir} 2>> {log} && echo "Computed GERP++ scores for" $contig >> {log} done @@ -650,32 +651,6 @@ rule gerp2coords: """.format(contig=contig)) -rule rescale_gerp: - """ - Re-scale GERP scores to correct time scale. - This analysis is run as one job per genome chunk, but is internally run per contig. - """ - input: - gerp_coords_dir=rules.gerp2coords.output.gerp_coords_dir, - chunk_bed=REF_DIR + "/gerp/" + REF_NAME + "/split_bed_files/{chunk}.bed", - output: - gerp_rescaled_dir=temp(directory("results/gerp/chunks/" + REF_NAME + "/gerp/{chunk}_gerp_rescaled/")), - log: - "results/logs/13_GERP/chunks/" + REF_NAME + "/gerp/{chunk}_rescale_gerp.log", - shell: - """ - if [ ! 
-d {output.gerp_rescaled_dir} ]; then - mkdir -p {output.gerp_rescaled_dir}; - fi - for contig in $(awk -F'\t' '{{print $1}}' {input.chunk_bed}) # run the analysis per contig - do - awk -F'\t' '{{ if($1 ~ /[0-9]+/ && $1 != 0) {{print $1/1000}} else {{print $1}} }}' OFS='\t' \ - {input.gerp_coords_dir}/${{contig}}.fasta.rates.parsed > {output.gerp_rescaled_dir}/${{contig}}.fasta.rates.parsed.rescaled 2>> {log} && - echo "GERP scores rescaled for" $contig >> {log} - done - """ - - rule get_ancestral_state: """Get the ancestral state of each position in the focal reference genome.""" input: @@ -709,7 +684,7 @@ rule produce_contig_out: """Merge the ancestral allele and gerp-scores into one file per contig.""" input: fasta_ancestral_dir=rules.get_ancestral_state.output.fasta_ancestral_dir, - gerp_rescaled_dir=rules.rescale_gerp.output.gerp_rescaled_dir, + gerp_coords_dir=rules.gerp2coords.output.gerp_coords_dir, chunk_bed=REF_DIR + "/gerp/" + REF_NAME + "/split_bed_files/{chunk}.bed", output: gerp_merged_dir=temp(directory("results/gerp/chunks/" + REF_NAME + "/gerp/{chunk}_gerp_merged/")), @@ -727,9 +702,9 @@ rule produce_contig_out: if [ ! 
-d {{output.gerp_merged_dir}} ]; then mkdir -p {{output.gerp_merged_dir}}; fi - paste {{input.fasta_ancestral_dir}}/{contig}.fasta.parsed {{input.gerp_rescaled_dir}}/{contig}.fasta.rates.parsed.rescaled | \ + paste {{input.fasta_ancestral_dir}}/{contig}.fasta.parsed {{input.gerp_coords_dir}}/{contig}.fasta.rates.parsed | \ sed "s/^/{contig}\t/g" > {{output.gerp_merged_dir}}/{contig}.fasta.parsed.rates 2>> {{log}} && - echo "Rescaled GERP-scores and ancestral states merged for {contig}" >> {{log}} + echo "GERP-scores and ancestral states merged for {contig}" >> {{log}} """.format(contig=contig)) @@ -772,7 +747,7 @@ rule merge_gerp_gz: rule plot_gerp_hist: - """Plot the rescaled GERP scores as histogram""" + """Plot the GERP scores as histogram""" input: gerp_out=rules.merge_gerp_gz.output.gerp_out, output: From cc37ee22bd6a7450b08429f2f17e7c668ce39c8a Mon Sep 17 00:00:00 2001 From: verku Date: Fri, 22 Sep 2023 09:23:45 +0200 Subject: [PATCH 03/49] Move scaling factor to compute_gerp rule so that the output is standardised but users can change the scaling directly in the rule --- config/config.yaml | 4 ---- workflow/rules/13_GERP.smk | 8 ++++++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/config/config.yaml b/config/config.yaml index c593eaa1..5124efd 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -502,10 +502,6 @@ gerp_ref_path: "" # without ".fa.gz", ".fasta.gz" or ".fna.gz". tree: "" -# Tree scaling factor for GERP++ ("-s") to re-scale tree. If a tree from -# www.timetree.org is provided (in millions of years), set to 0.001. -tree_scaling_factor: 0.001 - # Minimum and maximum GERP score for a site to be included into calculations # of relative mutational load. # Positive values indicate purifying selection. 
diff --git a/workflow/rules/13_GERP.smk b/workflow/rules/13_GERP.smk index 868990e..611da8e 100644 --- a/workflow/rules/13_GERP.smk +++ b/workflow/rules/13_GERP.smk @@ -585,6 +585,11 @@ rule concatenate_fasta_per_contig: rule compute_gerp: """ Compute GERP++ scores. + '-s 0.001': tree scaling factor to re-scale tree. Set to 0.001 to scale + a tree from millions of years (such as trees from www.timetree.org) to + billions of years. GERP++ default: 1.0 + '-v': verbose mode + '-a': alignment in mfa format Output only includes positions, no contig names. This analysis is run as one job per genome chunk, but is internally run per contig. """ @@ -596,7 +601,6 @@ rule compute_gerp: gerp_dir=temp(directory("results/gerp/chunks/" + REF_NAME + "/gerp/{chunk}_gerp_raw/")), params: name=REF_NAME, - tree_scale=config["tree_scaling_factor"], log: "results/logs/13_GERP/chunks/" + REF_NAME + "/gerp/{chunk}_compute_gerp.log", threads: 4 @@ -609,7 +613,7 @@ rule compute_gerp: fi for contig in $(awk -F'\t' '{{print $1}}' {input.chunk_bed}) # run the analysis per contig do - gerpcol -v -f {input.concatenated_fasta_dir}/${{contig}}.fasta -t {input.tree} -a -s {params.tree_scale} -e {params.name} 2> {log} && + gerpcol -v -f {input.concatenated_fasta_dir}/${{contig}}.fasta -t {input.tree} -a -s 0.001 -e {params.name} 2> {log} && mv {input.concatenated_fasta_dir}/${{contig}}.fasta.rates {output.gerp_dir} 2>> {log} && echo "Computed GERP++ scores for" $contig >> {log} done From c57b6ec5954c934ef95229e1f201bdc55a3dc71c Mon Sep 17 00:00:00 2001 From: verku Date: Fri, 22 Sep 2023 09:38:04 +0200 Subject: [PATCH 04/49] Clarify documentation --- workflow/rules/13_GERP.smk | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/workflow/rules/13_GERP.smk b/workflow/rules/13_GERP.smk index 611da8e..1d8a606 100644 --- a/workflow/rules/13_GERP.smk +++ b/workflow/rules/13_GERP.smk @@ -587,7 +587,8 @@ rule compute_gerp: Compute GERP++ scores. 
'-s 0.001': tree scaling factor to re-scale tree. Set to 0.001 to scale a tree from millions of years (such as trees from www.timetree.org) to - billions of years. GERP++ default: 1.0 + billions of years to ensure consistently scaled GERP scores across GenErode + runs. GERP++ gerpcol default: 1.0 '-v': verbose mode '-a': alignment in mfa format Output only includes positions, no contig names. From 79d338496f5d6a0350618ea2318c0c92fbe4c2f1 Mon Sep 17 00:00:00 2001 From: verku Date: Fri, 22 Sep 2023 09:39:47 +0200 Subject: [PATCH 05/49] Clarify documentation about input tree and GERP score scaling --- config/config.yaml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/config/config.yaml b/config/config.yaml index 5124efd..fcbc17b 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -496,8 +496,9 @@ gerp_ref_path: "" # Full path to phylogenetic tree of all species included in the analysis # (including the target species) in NEWICK format and including divergence # time estimates. -# Divergence time estimates must be in millions of years for correct scaling -# of GERP scores (see dated phylogenetic trees from www.timetree.org). +# Divergence time estimates must be in millions of years for consistent scaling +# of GERP scores across GenErode runs. Dated phylogenetic trees in millions of years +# are e.g. available from www.timetree.org. # Species names in the tree must be identical to the FASTA file names # without ".fa.gz", ".fasta.gz" or ".fna.gz". 
tree: "" From ec8b7ea2755c42d24615ba40bc985fb4ee5dfaf1 Mon Sep 17 00:00:00 2001 From: verku Date: Wed, 4 Oct 2023 11:41:16 +0200 Subject: [PATCH 06/49] Fix typo --- workflow/rules/6_autosome_sexchromosome_bed_files.smk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflow/rules/6_autosome_sexchromosome_bed_files.smk b/workflow/rules/6_autosome_sexchromosome_bed_files.smk index 36777ef..eda4d61 100644 --- a/workflow/rules/6_autosome_sexchromosome_bed_files.smk +++ b/workflow/rules/6_autosome_sexchromosome_bed_files.smk @@ -1,5 +1,5 @@ ########################################################################## -### 6.3 Generate BED files with sex-chromosomal and autosomal scaffolds (e.g. for mlRho or other downstream analyses) +### 6 Generate BED files with sex-chromosomal and autosomal scaffolds (e.g. for mlRho or other downstream analyses) # Code collecting output files from this part of the pipeline if len(sexchromosomeList) > 0: From 920f86c64ab21b4a9ec54fa7083ca9ba6e4a65e9 Mon Sep 17 00:00:00 2001 From: verku Date: Wed, 4 Oct 2023 11:41:24 +0200 Subject: [PATCH 07/49] Update documentation --- workflow/rules/common.smk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index 576cf4c..c8954a9 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -364,7 +364,7 @@ ALL_SAMPLES = list(hist_sm + mod_sm) ### -# mlRho: sex chromosomal scaffolds +# mlRho, merge VCFs: sex chromosomal scaffolds sexchromosomeList = [] # fill the list with scaffold/contig names from the list of sex chromosome-linked scaffolds/contigs, if available if os.path.exists(config["sexchromosomes"]): with open(config["sexchromosomes"], "r") as file: From 5ec6e1ac40b6654829b71010199f3edcd9121480 Mon Sep 17 00:00:00 2001 From: verku Date: Wed, 4 Oct 2023 11:43:47 +0200 Subject: [PATCH 08/49] Integrate creation of chromosomal bed files into pipeline steps and update documentation --- Snakefile | 
21 ++++++++-------- config/config.yaml | 62 +++++++++++++++++++++++----------------------- 2 files changed, 41 insertions(+), 42 deletions(-) diff --git a/Snakefile b/Snakefile index 1b3c4fc..6d6afab 100644 --- a/Snakefile +++ b/Snakefile @@ -96,6 +96,7 @@ if config["CpG_identification"]: include: "workflow/rules/3.3_bam_subsampling.smk" include: "workflow/rules/4_genotyping.smk" include: "workflow/rules/5_CpG_identification.smk" + include: "workflow/rules/6_autosome_sexchromosome_bed_files.smk" elif config["CpG_from_reference"] == True: @@ -107,6 +108,7 @@ if config["CpG_identification"]: include: "workflow/rules/3.2_historical_bam_mapDamage.smk" include: "workflow/rules/3.3_bam_subsampling.smk" include: "workflow/rules/5_CpG_identification.smk" + include: "workflow/rules/6_autosome_sexchromosome_bed_files.smk" elif config["CpG_from_vcf_and_reference"] == True: @@ -119,20 +121,10 @@ if config["CpG_identification"]: include: "workflow/rules/3.3_bam_subsampling.smk" include: "workflow/rules/4_genotyping.smk" include: "workflow/rules/5_CpG_identification.smk" + include: "workflow/rules/6_autosome_sexchromosome_bed_files.smk" ### -if config["autosome_sexchromosome_bed_files"]: - include: "workflow/rules/0.1_reference_genome_preps.smk" - include: "workflow/rules/0.2_repeat_identification.smk" - include: "workflow/rules/1.1_fastq_processing.smk" - include: "workflow/rules/2_mapping.smk" - include: "workflow/rules/3.1_bam_rmdup_realign_indels.smk" - include: "workflow/rules/3.2_historical_bam_mapDamage.smk" - include: "workflow/rules/3.3_bam_subsampling.smk" - include: "workflow/rules/6_autosome_sexchromosome_bed_files.smk" - - if config["mlRho"]: if len(config["CpG_samplenames"]) > 0: # to avoid genotyping if not necessary if config["CpG_from_vcf"] == True: @@ -196,6 +188,7 @@ if config["vcf_CpG_filtering"]: include: "workflow/rules/3.3_bam_subsampling.smk" include: "workflow/rules/4_genotyping.smk" include: "workflow/rules/5_CpG_identification.smk" + include: 
"workflow/rules/6_autosome_sexchromosome_bed_files.smk" include: "workflow/rules/8.1_vcf_CpG_filtering.smk" @@ -209,6 +202,7 @@ if config["vcf_qual_repeat_filtering"]: include: "workflow/rules/3.3_bam_subsampling.smk" include: "workflow/rules/4_genotyping.smk" include: "workflow/rules/5_CpG_identification.smk" + include: "workflow/rules/6_autosome_sexchromosome_bed_files.smk" include: "workflow/rules/8.1_vcf_CpG_filtering.smk" include: "workflow/rules/8.2_vcf_qual_repeat_filtering.smk" @@ -223,6 +217,7 @@ if config["merge_vcfs_per_dataset"]: include: "workflow/rules/3.3_bam_subsampling.smk" include: "workflow/rules/4_genotyping.smk" include: "workflow/rules/5_CpG_identification.smk" + include: "workflow/rules/6_autosome_sexchromosome_bed_files.smk" include: "workflow/rules/8.1_vcf_CpG_filtering.smk" include: "workflow/rules/8.2_vcf_qual_repeat_filtering.smk" include: "workflow/rules/9_merge_vcfs.smk" @@ -238,6 +233,7 @@ if config["pca"]: include: "workflow/rules/3.3_bam_subsampling.smk" include: "workflow/rules/4_genotyping.smk" include: "workflow/rules/5_CpG_identification.smk" + include: "workflow/rules/6_autosome_sexchromosome_bed_files.smk" include: "workflow/rules/8.1_vcf_CpG_filtering.smk" include: "workflow/rules/8.2_vcf_qual_repeat_filtering.smk" include: "workflow/rules/9_merge_vcfs.smk" @@ -254,6 +250,7 @@ if config["ROH"]: include: "workflow/rules/3.3_bam_subsampling.smk" include: "workflow/rules/4_genotyping.smk" include: "workflow/rules/5_CpG_identification.smk" + include: "workflow/rules/6_autosome_sexchromosome_bed_files.smk" include: "workflow/rules/8.1_vcf_CpG_filtering.smk" include: "workflow/rules/8.2_vcf_qual_repeat_filtering.smk" include: "workflow/rules/9_merge_vcfs.smk" @@ -270,6 +267,7 @@ if config["snpEff"]: include: "workflow/rules/3.3_bam_subsampling.smk" include: "workflow/rules/4_genotyping.smk" include: "workflow/rules/5_CpG_identification.smk" + include: "workflow/rules/6_autosome_sexchromosome_bed_files.smk" include: 
"workflow/rules/8.1_vcf_CpG_filtering.smk" include: "workflow/rules/8.2_vcf_qual_repeat_filtering.smk" include: "workflow/rules/9_merge_vcfs.smk" @@ -287,6 +285,7 @@ if config["gerp"]: include: "workflow/rules/3.3_bam_subsampling.smk" include: "workflow/rules/4_genotyping.smk" include: "workflow/rules/5_CpG_identification.smk" + include: "workflow/rules/6_autosome_sexchromosome_bed_files.smk" include: "workflow/rules/8.1_vcf_CpG_filtering.smk" include: "workflow/rules/8.2_vcf_qual_repeat_filtering.smk" include: "workflow/rules/9_merge_vcfs.smk" diff --git a/config/config.yaml b/config/config.yaml index ee1c0b1..052f5d6 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -21,6 +21,22 @@ # The file name will be reused by the pipeline and can have the file # name extensions *.fasta, *.fa or *.fna. ref_path: "" + + +# OPTIONAL: +# Relative path (from the main pipeline directory) to file listing +# scaffolds/contigs linked to sex chromosomes (one scaffold/contig +# name per line). +# Is used to create BED files to run mlRho separately for autosomes +# and sex chromosomes or exclusively for autosomes, and/or to create +# an autosome-only BCF file for PCA, ROH, snpEff and GERP analyses. +# Can also be used to specify any other contigs/scaffolds, e.g. +# unplaced or short scaffolds, for removal from the mlRho analysis +# and BCF file. +# Leave empty ("") if identity of sex chromosomes is unknown and/or +# if the pipeline should be run on all scaffolds/contigs of the genome. +sexchromosomes: "" # for example, "config/chrX_candidate_scaffolds.txt" +################################################################# ################################################################# @@ -213,8 +229,8 @@ genotyping: False ##### # OPTIONAL: -# Identify CpG sites for removal from VCF files and from -# downstream analyses and define samples to be CpG filtered. 
+# Identify CpG sites for removal from mlRho analyses, from VCF files +# and downstream analyses and define samples to be CpG filtered. # Three different methods are available to identify CpG sites. # This step will generate several BED files containing genome @@ -275,29 +291,9 @@ CpG_samplenames: [] ################################################################# ################################################################# -# Rules for BAM file processing for mlRho, and mlRho # +# Rules for mlRho analyses # ################################################################# -##### -# OPTIONAL: -# Generate BED files of autosomes and sex chromosomes for mlRho -# analyses, in case these should be analyzed separately from each -# other (see below for further options). -# Includes intersecting of the new chromosome-specific BED files -# with CpG- and repeat-masking BED files for downstream filtering. -autosome_sexchromosome_bed_files: False - -# Relative path (from the main pipeline directory) to file listing -# scaffolds/contigs linked to sex chromosomes (one scaffold/contig -# name per line). -# Leave empty ("") if identity of sex chromosomes is unknown and/or -# if mlRho should be run on all scaffolds/contigs of the genome. -# Keep the path to the file when running the next step (mlRho) -# separately for autosomes and sex chromosomes or only for autosomes. -sexchromosomes: "" # for example, "config/chrX_candidate_scaffolds.txt" -##### - - ##### # Run mlRho 2.9 on filtered BAM files. # Automatically generates a PDF file with a plot of genome-wide @@ -315,21 +311,22 @@ mlRho: False # and/or mlRho should be run on all contigs/scaffolds, # set mlRho_autosomes_sexchromosomes to False and do not provide # a path to a text file with sex-chromosomal contigs/scaffolds -# above when running mlRho. +# with the reference genome when running mlRho. 
# # 2) If the identity of sex-chromosomal contigs/scaffolds is known, # mlRho analyses can be run for autosomes and sex chromosomes # separately from each other. # In that case, set mlRho_autosomes_sexchromosomes to True and # provide the path to the file with sex-chromosomal contigs/scaffolds -# above when running mlRho. +# with the reference genome when running mlRho. # # 3) If the identity of sex-chromosomal contigs/scaffolds is known, -# sex-chromosomal contigs/scaffolds can be entirely excluded from +# sex-chromosomal contigs/scaffolds (or other contigs/scaffolds such +# as unplaced or short scaffolds) can be entirely excluded from # the analysis. # In that case, set mlRho_autosomes_sexchromosomes to False and # provide the path to the file with sex-chromosomal contigs/scaffolds -# above when running mlRho. +# with the reference genome when running mlRho. mlRho_autosomes_sexchromosomes: False ##### ################################################################# @@ -362,11 +359,13 @@ vcf_qual_repeat_filtering: False ##### # Merge BCF files into a BCF file containing all samples and remove all # sites that are not biallelic and with missing data across all samples -# up to a certain threshold as defined below. +# up to a certain threshold as defined below. Remove any sex-chromosomal +# contigs/scaffolds (optional). # Extract 1) all historical and 2) all modern samples from the merged and # filtered BCF file. -# Create a BED file of sites that remain after filtering across all samples -# to be used for downstream filtering of individual BCF files. +# Create a BED file of sites that remain after filtering and contig/scaffold +# removal across all samples to be used for downstream filtering of individual +# BCF files. merge_vcfs_per_dataset: False # Maximum allowed fraction of missing genotypes across all samples for a @@ -519,4 +518,5 @@ max_gerp: 1000 # scaffolds). 
##### -##### +################################################################# +################################################################# From 52b9c827aa3d2f78700181c3a16a08029edeb7b0 Mon Sep 17 00:00:00 2001 From: verku Date: Wed, 4 Oct 2023 12:24:30 +0200 Subject: [PATCH 09/49] Add rule to remove sex chromosomes (optional) from merged VCF file and connect to pipeline --- config/config.yaml | 19 +++++++ workflow/rules/9_merge_vcfs.smk | 97 ++++++++++++++++++++++----------- 2 files changed, 85 insertions(+), 31 deletions(-) diff --git a/config/config.yaml b/config/config.yaml index 052f5d6..b68d438 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -374,6 +374,23 @@ merge_vcfs_per_dataset: False # point number between 0.0 (no missing data allowed) and 1.0 (sites are # allowed that are completely missing). f_missing: 0.1 # default: 0.1 (i.e. maximum 10% missing genotypes per site) + +# Removal of sex-chromosomal contigs/scaffolds from the merged +# BCF file. Can also be used to remove any other contigs/scaffolds, +# e.g. unplaced or short scaffolds, from the BCF files. + +# 1) If all contigs/scaffolds should be kept in the BCF file and +# downstream analyses and/or if the identity of sex-chromosomal contigs/ +# scaffolds is unknown, set vcf_remove_chromosomes to False and do +# not provide a path to a text file with sex-chromosomal contigs/ +# scaffolds with the reference genome. +# +# 2) If sex-chromosomal contigs/scaffolds (or other contigs/scaffolds +# such as unplaced or short scaffolds) should be entirely excluded from +# the merged BCF file and downstream analyses, set vcf_remove_chromosomes +# to True and provide the path to the file with sex-chromosomal contigs/scaffolds +# with the reference genome. +vcf_remove_chromosomes: False ##### ################################################################# @@ -454,6 +471,8 @@ snpEff: False # same directory as the GTF file. # That way, the database has to be built only once for a given GTF file. 
gtf_path: "" +##### + ################################################################# ################################################################# diff --git a/workflow/rules/9_merge_vcfs.smk b/workflow/rules/9_merge_vcfs.smk index 56bed87..c130a01 100644 --- a/workflow/rules/9_merge_vcfs.smk +++ b/workflow/rules/9_merge_vcfs.smk @@ -1,5 +1,5 @@ ########################################################################## -### 9. Merge VCF files +### 9. Merge VCF files and filter for biallelic sites, missingness and sex-chromosomal contigs/scaffolds # Code collecting output files from this part of the pipeline all_outputs.append("results/all/vcf/" + REF_NAME + "/stats/vcf_merged_missing/multiqc/multiqc_report.html") @@ -151,16 +151,28 @@ def merge_all_index_inputs(wildcards): def missingness_filtered_vcf_multiqc_inputs(wildcards): """Input for missingness_filtered_vcf_multiqc""" - if os.path.exists(config["historical_samples"]) and os.path.exists(config["modern_samples"]): - return expand("results/{dataset}/vcf/" + REF_NAME + "/stats/vcf_merged_missing/" + REF_NAME + ".{dataset}.merged.biallelic.fmissing{fmiss}.vcf.stats.txt", - dataset=["all", "historical", "modern"], - fmiss=config["f_missing"],) - elif os.path.exists(config["historical_samples"]): - return expand("results/historical/vcf/" + REF_NAME + "/stats/vcf_merged_missing/" + REF_NAME + ".historical.merged.biallelic.fmissing{fmiss}.vcf.stats.txt", + if config["vcf_remove_chromosomes"]: + if os.path.exists(config["historical_samples"]) and os.path.exists(config["modern_samples"]): + return expand("results/{dataset}/vcf/" + REF_NAME + "/stats/vcf_merged_missing/" + REF_NAME + ".{dataset}.merged.biallelic.fmissing{fmiss}.autos.vcf.stats.txt", + dataset=["all", "historical", "modern"], fmiss=config["f_missing"],) - elif os.path.exists(config["modern_samples"]): - return expand("results/modern/vcf/" + REF_NAME + "/stats/vcf_merged_missing/" + REF_NAME + 
".modern.merged.biallelic.fmissing{fmiss}.vcf.stats.txt", + elif os.path.exists(config["historical_samples"]): + return expand("results/historical/vcf/" + REF_NAME + "/stats/vcf_merged_missing/" + REF_NAME + ".historical.merged.biallelic.fmissing{fmiss}.autos.vcf.stats.txt", + fmiss=config["f_missing"],) + elif os.path.exists(config["modern_samples"]): + return expand("results/modern/vcf/" + REF_NAME + "/stats/vcf_merged_missing/" + REF_NAME + ".modern.merged.biallelic.fmissing{fmiss}.autos.vcf.stats.txt", + fmiss=config["f_missing"],) + else: + if os.path.exists(config["historical_samples"]) and os.path.exists(config["modern_samples"]): + return expand("results/{dataset}/vcf/" + REF_NAME + "/stats/vcf_merged_missing/" + REF_NAME + ".{dataset}.merged.biallelic.fmissing{fmiss}.vcf.stats.txt", + dataset=["all", "historical", "modern"], fmiss=config["f_missing"],) + elif os.path.exists(config["historical_samples"]): + return expand("results/historical/vcf/" + REF_NAME + "/stats/vcf_merged_missing/" + REF_NAME + ".historical.merged.biallelic.fmissing{fmiss}.vcf.stats.txt", + fmiss=config["f_missing"],) + elif os.path.exists(config["modern_samples"]): + return expand("results/modern/vcf/" + REF_NAME + "/stats/vcf_merged_missing/" + REF_NAME + ".modern.merged.biallelic.fmissing{fmiss}.vcf.stats.txt", + fmiss=config["f_missing"],) # snakemake rules @@ -288,7 +300,7 @@ rule biallelic_filtered_vcf_stats: rule biallelic_filtered_vcf_multiqc: - """Collect all stats files from merged vcf files filtered for missing data""" + """Collect all stats files from merged vcf files filtered for biallelic sites""" input: rules.biallelic_filtered_vcf_stats.output.stats, output: @@ -312,7 +324,6 @@ rule filter_vcf_missing: input: bcf=rules.filter_vcf_biallelic.output.bcf, index=rules.filter_vcf_biallelic.output.index, - stats=rules.biallelic_filtered_vcf_stats.output.stats, multiqc=rules.biallelic_filtered_vcf_multiqc.output.stats, output: vcf="results/all/vcf/" + REF_NAME + 
".all.merged.biallelic.fmissing{fmiss}.vcf.gz", @@ -344,14 +355,40 @@ rule filter_vcf_missing: """ +rule remove_chromosomes: + input: + bcf=rules.filter_vcf_missing.output.bcf, + index=rules.filter_vcf_missing.output.index, + multiqc=rules.missing_filtered_vcf_multiqc.output.stats, + output: + vcf="results/all/vcf/" + REF_NAME + ".all.merged.biallelic.fmissing{fmiss}.autos.vcf.gz", + index="results/all/vcf/" + REF_NAME + ".all.merged.biallelic.fmissing{fmiss}.autos.vcf.gz.csi", + threads: 2 + params: + exclude = sexchromosomeList.join(",") # parse list with contigs/scaffolds to exclude and convert to format chr1,chr2,chr3 + log: + "results/logs/9_merge_vcfs/" + REF_NAME + ".all_fmissing{fmiss}.autos_remove_chromosomes.log", + singularity: + "docker://quay.io/biocontainers/bcftools:1.9--h68d8f2e_9" + shell: + """ + bcftools view {input.bcf} \ + -t ^{params.exclude} \ + -O z -o {output.vcf} + + bcftools index -f {output.vcf} 2>> {log} + """ + + rule filtered_vcf2bed: - """Convert the VCF file after removal of missing data to BED file containing the remaining sites""" + """Convert the VCF file after removal of missing data (and optionally sex chromosomes) to BED file containing the remaining sites""" input: - vcf=rules.filter_vcf_missing.output.vcf, + vcf="results/all/vcf/" + REF_NAME + ".all.merged.biallelic.fmissing{fmiss}{autos}.vcf.gz", + index="results/all/vcf/" + REF_NAME + ".all.merged.biallelic.fmissing{fmiss}{autos}.vcf.gz.csi", output: - bed="results/all/vcf/" + REF_NAME + ".all.merged.biallelic.fmissing{fmiss}.bed", + bed="results/all/vcf/" + REF_NAME + ".all.merged.biallelic.fmissing{fmiss}{autos}.bed", log: - "results/logs/9_merge_vcfs/" + REF_NAME + ".all_fmissing{fmiss}_filtered_vcf2bed.log", + "results/logs/9_merge_vcfs/" + REF_NAME + ".all_fmissing{fmiss}{autos}_filtered_vcf2bed.log", singularity: "docker://nbisweden/generode-bedtools-2.29.2" shell: @@ -362,15 +399,14 @@ rule filtered_vcf2bed: rule extract_historical_samples: input: - 
vcf=rules.filter_vcf_missing.output.vcf, - index=rules.filter_vcf_missing.output.index, - stats=rules.biallelic_filtered_vcf_stats.output.stats, + vcf="results/all/vcf/" + REF_NAME + ".all.merged.biallelic.fmissing{fmiss}{autos}.vcf.gz", + index="results/all/vcf/" + REF_NAME + ".all.merged.biallelic.fmissing{fmiss}{autos}.vcf.gz.csi", bed=rules.filtered_vcf2bed.output.bed, output: - vcf="results/historical/vcf/" + REF_NAME + ".historical.merged.biallelic.fmissing{fmiss}.vcf.gz", - index="results/historical/vcf/" + REF_NAME + ".historical.merged.biallelic.fmissing{fmiss}.vcf.gz.csi", + vcf="results/historical/vcf/" + REF_NAME + ".historical.merged.biallelic.fmissing{fmiss}{autos}.vcf.gz", + index="results/historical/vcf/" + REF_NAME + ".historical.merged.biallelic.fmissing{fmiss}{autos}.vcf.gz.csi", log: - "results/logs/9_merge_vcfs/" + REF_NAME + ".historical_fmissing{fmiss}_extract_historical_samples.log", + "results/logs/9_merge_vcfs/" + REF_NAME + ".historical_fmissing{fmiss}{autos}_extract_historical_samples.log", singularity: "docker://quay.io/biocontainers/bcftools:1.9--h68d8f2e_9" params: @@ -396,15 +432,14 @@ rule extract_historical_samples: rule extract_modern_samples: input: - vcf=rules.filter_vcf_missing.output.vcf, - index=rules.filter_vcf_missing.output.index, - stats=rules.biallelic_filtered_vcf_stats.output.stats, + vcf="results/all/vcf/" + REF_NAME + ".all.merged.biallelic.fmissing{fmiss}{autos}.vcf.gz", + index="results/all/vcf/" + REF_NAME + ".all.merged.biallelic.fmissing{fmiss}{autos}.vcf.gz.csi", bed=rules.filtered_vcf2bed.output.bed, output: - vcf="results/modern/vcf/" + REF_NAME + ".modern.merged.biallelic.fmissing{fmiss}.vcf.gz", - index="results/modern/vcf/" + REF_NAME + ".modern.merged.biallelic.fmissing{fmiss}.vcf.gz.csi", + vcf="results/modern/vcf/" + REF_NAME + ".modern.merged.biallelic.fmissing{fmiss}{autos}.vcf.gz", + index="results/modern/vcf/" + REF_NAME + ".modern.merged.biallelic.fmissing{fmiss}{autos}.vcf.gz.csi", log: - 
"results/logs/9_merge_vcfs/" + REF_NAME + ".modern_fmissing{fmiss}_extract_modern_samples.log", + "results/logs/9_merge_vcfs/" + REF_NAME + ".modern_fmissing{fmiss}{autos}_extract_modern_samples.log", singularity: "docker://quay.io/biocontainers/bcftools:1.9--h68d8f2e_9" params: @@ -431,12 +466,12 @@ rule extract_modern_samples: rule missingness_filtered_vcf_stats: """Obtain summary stats of merged vcf file""" input: - merged="results/{dataset}/vcf/" + REF_NAME + ".{dataset}.merged.biallelic.fmissing{fmiss}.vcf.gz", - index="results/{dataset}/vcf/" + REF_NAME + ".{dataset}.merged.biallelic.fmissing{fmiss}.vcf.gz.csi", + merged="results/{dataset}/vcf/" + REF_NAME + ".{dataset}.merged.biallelic.fmissing{fmiss}{autos}.vcf.gz", + index="results/{dataset}/vcf/" + REF_NAME + ".{dataset}.merged.biallelic.fmissing{fmiss}{autos}.vcf.gz.csi", output: - stats="results/{dataset}/vcf/" + REF_NAME + "/stats/vcf_merged_missing/" + REF_NAME + ".{dataset}.merged.biallelic.fmissing{fmiss}.vcf.stats.txt", + stats="results/{dataset}/vcf/" + REF_NAME + "/stats/vcf_merged_missing/" + REF_NAME + ".{dataset}.merged.biallelic.fmissing{fmiss}{autos}.vcf.stats.txt", log: - "results/logs/9_merge_vcfs/" + REF_NAME + ".{dataset}_fmissing{fmiss}_missingness_filtered_vcf_stats.log", + "results/logs/9_merge_vcfs/" + REF_NAME + ".{dataset}_fmissing{fmiss}{autos}_missingness_filtered_vcf_stats.log", singularity: "docker://quay.io/biocontainers/bcftools:1.9--h68d8f2e_9" shell: From a982b81e670ae53a35a2921595f30f31966c9dbb Mon Sep 17 00:00:00 2001 From: verku Date: Thu, 5 Oct 2023 13:35:57 +0200 Subject: [PATCH 10/49] Implement sex chromosome removal from BCF files the same way as from mlRho to be compatible in a given pipeline run --- config/config.yaml | 37 +++++++++++---------------------- workflow/rules/9_merge_vcfs.smk | 9 ++++---- 2 files changed, 16 insertions(+), 30 deletions(-) diff --git a/config/config.yaml b/config/config.yaml index b68d438..3844bce 100644 --- a/config/config.yaml +++ 
b/config/config.yaml @@ -27,9 +27,10 @@ ref_path: "" # Relative path (from the main pipeline directory) to file listing # scaffolds/contigs linked to sex chromosomes (one scaffold/contig # name per line). -# Is used to create BED files to run mlRho separately for autosomes -# and sex chromosomes or exclusively for autosomes, and/or to create -# an autosome-only BCF file for PCA, ROH, snpEff and GERP analyses. +# If provided, the listed scaffolds/contigs are excluded from mlRho +# analyses (but see below how to run mlRho for autosomes and sex +# chromosomes separately from each other) and from the merged BCF file +# that is used for PCA, ROH, snpEff and GERP analyses. # Can also be used to specify any other contigs/scaffolds, e.g. # unplaced or short scaffolds, for removal from the mlRho analysis # and BCF file. @@ -311,14 +312,14 @@ mlRho: False # and/or mlRho should be run on all contigs/scaffolds, # set mlRho_autosomes_sexchromosomes to False and do not provide # a path to a text file with sex-chromosomal contigs/scaffolds -# with the reference genome when running mlRho. +# with the reference genome ("sexchromosomes") when running mlRho. # # 2) If the identity of sex-chromosomal contigs/scaffolds is known, # mlRho analyses can be run for autosomes and sex chromosomes # separately from each other. # In that case, set mlRho_autosomes_sexchromosomes to True and # provide the path to the file with sex-chromosomal contigs/scaffolds -# with the reference genome when running mlRho. +# with the reference genome ("sexchromosomes") when running mlRho. # # 3) If the identity of sex-chromosomal contigs/scaffolds is known, # sex-chromosomal contigs/scaffolds (or other contigs/scaffolds such @@ -326,7 +327,7 @@ mlRho: False # the analysis. # In that case, set mlRho_autosomes_sexchromosomes to False and # provide the path to the file with sex-chromosomal contigs/scaffolds -# with the reference genome when running mlRho. 
+# with the reference genome ("sexchromosomes") when running mlRho. mlRho_autosomes_sexchromosomes: False ##### ################################################################# @@ -359,8 +360,11 @@ vcf_qual_repeat_filtering: False ##### # Merge BCF files into a BCF file containing all samples and remove all # sites that are not biallelic and with missing data across all samples -# up to a certain threshold as defined below. Remove any sex-chromosomal -# contigs/scaffolds (optional). +# up to a certain threshold as defined below. +# If the path to a file with sex-chromosomal contigs/scaffolds is provided +# with the reference genome ("sexchromosomes"), these scaffolds/contigs are +# removed from the merged and filtered BCF file and all downstream analyses +# (optional). # Extract 1) all historical and 2) all modern samples from the merged and # filtered BCF file. # Create a BED file of sites that remain after filtering and contig/scaffold @@ -374,23 +378,6 @@ merge_vcfs_per_dataset: False # point number between 0.0 (no missing data allowed) and 1.0 (sites are # allowed that are completely missing). f_missing: 0.1 # default: 0.1 (i.e. maximum 10% missing genotypes per site) - -# Removal of sex-chromosomal contigs/scaffolds from the merged -# BCF file. Can also be used to remove any other contigs/scaffolds, -# e.g. unplaced or short scaffolds, from the BCF files. - -# 1) If all contigs/scaffolds should be kept in the BCF file and -# downstream analyses and/or if the identity of sex-chromosomal contigs/ -# scaffolds is unknown, set vcf_remove_chromosomes to False and do -# not provide a path to a text file with sex-chromosomal contigs/ -# scaffolds with the reference genome. 
-# -# 2) If sex-chromosomal contigs/scaffolds (or other contigs/scaffolds -# such as unplaced or short scaffolds) should be entirely excluded from -# the merged BCF file and downstream analyses, set vcf_remove_chromosomes -# to True and provide the path to the file with sex-chromosomal contigs/scaffolds -# with the reference genome. -vcf_remove_chromosomes: False ##### ################################################################# diff --git a/workflow/rules/9_merge_vcfs.smk b/workflow/rules/9_merge_vcfs.smk index c130a01..9dc4096 100644 --- a/workflow/rules/9_merge_vcfs.smk +++ b/workflow/rules/9_merge_vcfs.smk @@ -151,7 +151,7 @@ def merge_all_index_inputs(wildcards): def missingness_filtered_vcf_multiqc_inputs(wildcards): """Input for missingness_filtered_vcf_multiqc""" - if config["vcf_remove_chromosomes"]: + if len(sexchromosomeList) > 0: if os.path.exists(config["historical_samples"]) and os.path.exists(config["modern_samples"]): return expand("results/{dataset}/vcf/" + REF_NAME + "/stats/vcf_merged_missing/" + REF_NAME + ".{dataset}.merged.biallelic.fmissing{fmiss}.autos.vcf.stats.txt", dataset=["all", "historical", "modern"], @@ -162,7 +162,7 @@ def missingness_filtered_vcf_multiqc_inputs(wildcards): elif os.path.exists(config["modern_samples"]): return expand("results/modern/vcf/" + REF_NAME + "/stats/vcf_merged_missing/" + REF_NAME + ".modern.merged.biallelic.fmissing{fmiss}.autos.vcf.stats.txt", fmiss=config["f_missing"],) - else: + elif len(sexchromosomeList) == 0: if os.path.exists(config["historical_samples"]) and os.path.exists(config["modern_samples"]): return expand("results/{dataset}/vcf/" + REF_NAME + "/stats/vcf_merged_missing/" + REF_NAME + ".{dataset}.merged.biallelic.fmissing{fmiss}.vcf.stats.txt", dataset=["all", "historical", "modern"], @@ -357,15 +357,14 @@ rule filter_vcf_missing: rule remove_chromosomes: input: - bcf=rules.filter_vcf_missing.output.bcf, + bcf=rules.filter_vcf_missing.output.vcf, 
index=rules.filter_vcf_missing.output.index, - multiqc=rules.missing_filtered_vcf_multiqc.output.stats, output: vcf="results/all/vcf/" + REF_NAME + ".all.merged.biallelic.fmissing{fmiss}.autos.vcf.gz", index="results/all/vcf/" + REF_NAME + ".all.merged.biallelic.fmissing{fmiss}.autos.vcf.gz.csi", threads: 2 params: - exclude = sexchromosomeList.join(",") # parse list with contigs/scaffolds to exclude and convert to format chr1,chr2,chr3 + exclude = ",".join(sexchromosomeList) # parse list with contigs/scaffolds to exclude and convert to format chr1,chr2,chr3 for removal with bcftools view log: "results/logs/9_merge_vcfs/" + REF_NAME + ".all_fmissing{fmiss}.autos_remove_chromosomes.log", singularity: From 80b5bd58375d3a21434e17f64cae1519eb3a27d6 Mon Sep 17 00:00:00 2001 From: verku Date: Thu, 5 Oct 2023 13:55:16 +0200 Subject: [PATCH 11/49] Replace all with genome to be consistent with chr wildcard in 9_merge_vcfs.smk and to describe the file content better --- workflow/rules/7_mlRho.smk | 76 +++++++++++++++++++------------------- 1 file changed, 38 insertions(+), 38 deletions(-) diff --git a/workflow/rules/7_mlRho.smk b/workflow/rules/7_mlRho.smk index 53bd40d..a48fbd6 100644 --- a/workflow/rules/7_mlRho.smk +++ b/workflow/rules/7_mlRho.smk @@ -70,7 +70,7 @@ def bed_file_sexchr_mlRho(wildcards): bed = "results/" + REF_NAME + ".repma.sexchr.bed" return bed -def bed_file_all_mlRho(wildcards): +def bed_file_genome_mlRho(wildcards): """Select correct bed file for filtering during mlRho analysis""" if len(sexchromosomeList) == 0: if config["CpG_from_vcf"] == True: @@ -201,50 +201,50 @@ def all_mlRho_outputs(wildcards): DP=config["subsampling_depth"],) outlist += (rescaled_not_subsampled_CpG_chr_mlRho + not_rescaled_not_subsampled_CpG_chr_mlRho + rescaled_subsampled_CpG_chr_mlRho + not_rescaled_subsampled_CpG_chr_mlRho) elif len(sexchromosomeList) == 0: - rescaled_not_subsampled_not_CpG_mlRho = expand("results/historical/mlRho/" + REF_NAME + 
"/{sample}.merged.rmdup.merged.realn.rescaled.repma.all.mlRho.txt", + rescaled_not_subsampled_not_CpG_mlRho = expand("results/historical/mlRho/" + REF_NAME + "/{sample}.merged.rmdup.merged.realn.rescaled.repma.genome.mlRho.txt", sample=HIST_RESCALED_NOT_SUBSAMPLED_NOT_CpG_SAMPLES,) - not_rescaled_not_subsampled_not_CpG_mlRho = expand("results/historical/mlRho/" + REF_NAME + "/{sample}.merged.rmdup.merged.realn.repma.all.mlRho.txt", + not_rescaled_not_subsampled_not_CpG_mlRho = expand("results/historical/mlRho/" + REF_NAME + "/{sample}.merged.rmdup.merged.realn.repma.genome.mlRho.txt", sample=HIST_NOT_RESCALED_NOT_SUBSAMPLED_NOT_CpG_SAMPLES,) - rescaled_subsampled_not_CpG_mlRho = expand("results/historical/mlRho/" + REF_NAME + "/{sample}.merged.rmdup.merged.realn.rescaled.mapped_q30.subs_dp{DP}.repma.all.mlRho.txt", + rescaled_subsampled_not_CpG_mlRho = expand("results/historical/mlRho/" + REF_NAME + "/{sample}.merged.rmdup.merged.realn.rescaled.mapped_q30.subs_dp{DP}.repma.genome.mlRho.txt", sample=HIST_RESCALED_SUBSAMPLED_NOT_CpG_SAMPLES, DP=config["subsampling_depth"],) - not_rescaled_subsampled_not_CpG_mlRho = expand("results/historical/mlRho/" + REF_NAME + "/{sample}.merged.rmdup.merged.realn.mapped_q30.subs_dp{DP}.repma.all.mlRho.txt", + not_rescaled_subsampled_not_CpG_mlRho = expand("results/historical/mlRho/" + REF_NAME + "/{sample}.merged.rmdup.merged.realn.mapped_q30.subs_dp{DP}.repma.genome.mlRho.txt", sample=HIST_NOT_RESCALED_SUBSAMPLED_NOT_CpG_SAMPLES, DP=config["subsampling_depth"],) outlist += (rescaled_not_subsampled_not_CpG_mlRho + not_rescaled_not_subsampled_not_CpG_mlRho + rescaled_subsampled_not_CpG_mlRho + not_rescaled_subsampled_not_CpG_mlRho) if config["CpG_from_vcf"] == True: - rescaled_not_subsampled_CpG_mlRho = expand("results/historical/mlRho/" + REF_NAME + "/{sample}.merged.rmdup.merged.realn.rescaled.noCpG_vcf.repma.all.mlRho.txt", + rescaled_not_subsampled_CpG_mlRho = expand("results/historical/mlRho/" + REF_NAME + 
"/{sample}.merged.rmdup.merged.realn.rescaled.noCpG_vcf.repma.genome.mlRho.txt", sample=HIST_RESCALED_NOT_SUBSAMPLED_CpG_SAMPLES,) - not_rescaled_not_subsampled_CpG_mlRho = expand("results/historical/mlRho/" + REF_NAME + "/{sample}.merged.rmdup.merged.realn.noCpG_vcf.repma.all.mlRho.txt", + not_rescaled_not_subsampled_CpG_mlRho = expand("results/historical/mlRho/" + REF_NAME + "/{sample}.merged.rmdup.merged.realn.noCpG_vcf.repma.genome.mlRho.txt", sample=HIST_NOT_RESCALED_NOT_SUBSAMPLED_CpG_SAMPLES,) - rescaled_subsampled_CpG_mlRho = expand("results/historical/mlRho/" + REF_NAME + "/{sample}.merged.rmdup.merged.realn.rescaled.mapped_q30.subs_dp{DP}.noCpG_vcf.repma.all.mlRho.txt", + rescaled_subsampled_CpG_mlRho = expand("results/historical/mlRho/" + REF_NAME + "/{sample}.merged.rmdup.merged.realn.rescaled.mapped_q30.subs_dp{DP}.noCpG_vcf.repma.genome.mlRho.txt", sample=HIST_RESCALED_SUBSAMPLED_CpG_SAMPLES, DP=config["subsampling_depth"],) - not_rescaled_subsampled_CpG_mlRho = expand("results/historical/mlRho/" + REF_NAME + "/{sample}.merged.rmdup.merged.realn.mapped_q30.subs_dp{DP}.noCpG_vcf.repma.all.mlRho.txt", + not_rescaled_subsampled_CpG_mlRho = expand("results/historical/mlRho/" + REF_NAME + "/{sample}.merged.rmdup.merged.realn.mapped_q30.subs_dp{DP}.noCpG_vcf.repma.genome.mlRho.txt", sample=HIST_NOT_RESCALED_SUBSAMPLED_CpG_SAMPLES, DP=config["subsampling_depth"],) outlist += (rescaled_not_subsampled_CpG_mlRho + not_rescaled_not_subsampled_CpG_mlRho + rescaled_subsampled_CpG_mlRho + not_rescaled_subsampled_CpG_mlRho) elif config["CpG_from_reference"] == True: - rescaled_not_subsampled_CpG_mlRho = expand("results/historical/mlRho/" + REF_NAME + "/{sample}.merged.rmdup.merged.realn.rescaled.noCpG_ref.repma.all.mlRho.txt", + rescaled_not_subsampled_CpG_mlRho = expand("results/historical/mlRho/" + REF_NAME + "/{sample}.merged.rmdup.merged.realn.rescaled.noCpG_ref.repma.genome.mlRho.txt", sample=HIST_RESCALED_NOT_SUBSAMPLED_CpG_SAMPLES,) - 
not_rescaled_not_subsampled_CpG_mlRho = expand("results/historical/mlRho/" + REF_NAME + "/{sample}.merged.rmdup.merged.realn.noCpG_ref.repma.all.mlRho.txt", + not_rescaled_not_subsampled_CpG_mlRho = expand("results/historical/mlRho/" + REF_NAME + "/{sample}.merged.rmdup.merged.realn.noCpG_ref.repma.genome.mlRho.txt", sample=HIST_NOT_RESCALED_NOT_SUBSAMPLED_CpG_SAMPLES,) - rescaled_subsampled_CpG_mlRho = expand("results/historical/mlRho/" + REF_NAME + "/{sample}.merged.rmdup.merged.realn.rescaled.mapped_q30.subs_dp{DP}.noCpG_ref.repma.all.mlRho.txt", + rescaled_subsampled_CpG_mlRho = expand("results/historical/mlRho/" + REF_NAME + "/{sample}.merged.rmdup.merged.realn.rescaled.mapped_q30.subs_dp{DP}.noCpG_ref.repma.genome.mlRho.txt", sample=HIST_RESCALED_SUBSAMPLED_CpG_SAMPLES, DP=config["subsampling_depth"],) - not_rescaled_subsampled_CpG_mlRho = expand("results/historical/mlRho/" + REF_NAME + "/{sample}.merged.rmdup.merged.realn.mapped_q30.subs_dp{DP}.noCpG_ref.repma.all.mlRho.txt", + not_rescaled_subsampled_CpG_mlRho = expand("results/historical/mlRho/" + REF_NAME + "/{sample}.merged.rmdup.merged.realn.mapped_q30.subs_dp{DP}.noCpG_ref.repma.genome.mlRho.txt", sample=HIST_NOT_RESCALED_SUBSAMPLED_CpG_SAMPLES, DP=config["subsampling_depth"],) outlist += (rescaled_not_subsampled_CpG_mlRho + not_rescaled_not_subsampled_CpG_mlRho + rescaled_subsampled_CpG_mlRho + not_rescaled_subsampled_CpG_mlRho) elif config["CpG_from_vcf_and_reference"] == True: - rescaled_not_subsampled_CpG_mlRho = expand("results/historical/mlRho/" + REF_NAME + "/{sample}.merged.rmdup.merged.realn.rescaled.noCpG_vcfref.repma.all.mlRho.txt", + rescaled_not_subsampled_CpG_mlRho = expand("results/historical/mlRho/" + REF_NAME + "/{sample}.merged.rmdup.merged.realn.rescaled.noCpG_vcfref.repma.genome.mlRho.txt", sample=HIST_RESCALED_NOT_SUBSAMPLED_CpG_SAMPLES,) - not_rescaled_not_subsampled_CpG_mlRho = expand("results/historical/mlRho/" + REF_NAME + 
"/{sample}.merged.rmdup.merged.realn.noCpG_vcfref.repma.all.mlRho.txt", + not_rescaled_not_subsampled_CpG_mlRho = expand("results/historical/mlRho/" + REF_NAME + "/{sample}.merged.rmdup.merged.realn.noCpG_vcfref.repma.genome.mlRho.txt", sample=HIST_NOT_RESCALED_NOT_SUBSAMPLED_CpG_SAMPLES,) - rescaled_subsampled_CpG_mlRho = expand("results/historical/mlRho/" + REF_NAME + "/{sample}.merged.rmdup.merged.realn.rescaled.mapped_q30.subs_dp{DP}.noCpG_vcfref.repma.all.mlRho.txt", + rescaled_subsampled_CpG_mlRho = expand("results/historical/mlRho/" + REF_NAME + "/{sample}.merged.rmdup.merged.realn.rescaled.mapped_q30.subs_dp{DP}.noCpG_vcfref.repma.genome.mlRho.txt", sample=HIST_RESCALED_SUBSAMPLED_CpG_SAMPLES, DP=config["subsampling_depth"],) - not_rescaled_subsampled_CpG_mlRho = expand("results/historical/mlRho/" + REF_NAME + "/{sample}.merged.rmdup.merged.realn.mapped_q30.subs_dp{DP}.noCpG_vcfref.repma.all.mlRho.txt", + not_rescaled_subsampled_CpG_mlRho = expand("results/historical/mlRho/" + REF_NAME + "/{sample}.merged.rmdup.merged.realn.mapped_q30.subs_dp{DP}.noCpG_vcfref.repma.genome.mlRho.txt", sample=HIST_NOT_RESCALED_SUBSAMPLED_CpG_SAMPLES, DP=config["subsampling_depth"],) outlist += (rescaled_not_subsampled_CpG_mlRho + not_rescaled_not_subsampled_CpG_mlRho + rescaled_subsampled_CpG_mlRho + not_rescaled_subsampled_CpG_mlRho) @@ -315,30 +315,30 @@ def all_mlRho_outputs(wildcards): DP=config["subsampling_depth"],) outlist += (not_subsampled_CpG_chr_mlRho + subsampled_CpG_chr_mlRho) elif len(sexchromosomeList) == 0: - not_subsampled_not_CpG_mlRho = expand("results/modern/mlRho/" + REF_NAME + "/{sample}.merged.rmdup.merged.realn.repma.all.mlRho.txt", + not_subsampled_not_CpG_mlRho = expand("results/modern/mlRho/" + REF_NAME + "/{sample}.merged.rmdup.merged.realn.repma.genome.mlRho.txt", sample=MODERN_NOT_SUBSAMPLED_NOT_CpG_SAMPLES,) - subsampled_not_CpG_mlRho = expand("results/modern/mlRho/" + REF_NAME + 
"/{sample}.merged.rmdup.merged.realn.mapped_q30.subs_dp{DP}.repma.all.mlRho.txt", + subsampled_not_CpG_mlRho = expand("results/modern/mlRho/" + REF_NAME + "/{sample}.merged.rmdup.merged.realn.mapped_q30.subs_dp{DP}.repma.genome.mlRho.txt", sample=MODERN_SUBSAMPLED_NOT_CpG_SAMPLES, DP=config["subsampling_depth"],) outlist += (not_subsampled_not_CpG_mlRho + subsampled_not_CpG_mlRho) if config["CpG_from_vcf"] == True: - not_subsampled_CpG_mlRho = expand("results/modern/mlRho/" + REF_NAME + "/{sample}.merged.rmdup.merged.realn.noCpG_vcf.repma.all.mlRho.txt", + not_subsampled_CpG_mlRho = expand("results/modern/mlRho/" + REF_NAME + "/{sample}.merged.rmdup.merged.realn.noCpG_vcf.repma.genome.mlRho.txt", sample=MODERN_NOT_SUBSAMPLED_CpG_SAMPLES,) - subsampled_CpG_mlRho = expand("results/modern/mlRho/" + REF_NAME + "/{sample}.merged.rmdup.merged.realn.mapped_q30.subs_dp{DP}.noCpG_vcf.repma.all.mlRho.txt", + subsampled_CpG_mlRho = expand("results/modern/mlRho/" + REF_NAME + "/{sample}.merged.rmdup.merged.realn.mapped_q30.subs_dp{DP}.noCpG_vcf.repma.genome.mlRho.txt", sample=MODERN_SUBSAMPLED_CpG_SAMPLES, DP=config["subsampling_depth"],) outlist += (not_subsampled_CpG_mlRho + subsampled_CpG_mlRho) elif config["CpG_from_reference"] == True: - not_subsampled_CpG_mlRho = expand("results/modern/mlRho/" + REF_NAME + "/{sample}.merged.rmdup.merged.realn.noCpG_ref.repma.all.mlRho.txt", + not_subsampled_CpG_mlRho = expand("results/modern/mlRho/" + REF_NAME + "/{sample}.merged.rmdup.merged.realn.noCpG_ref.repma.genome.mlRho.txt", sample=MODERN_NOT_SUBSAMPLED_CpG_SAMPLES,) - subsampled_CpG_mlRho = expand("results/modern/mlRho/" + REF_NAME + "/{sample}.merged.rmdup.merged.realn.mapped_q30.subs_dp{DP}.noCpG_ref.repma.all.mlRho.txt", + subsampled_CpG_mlRho = expand("results/modern/mlRho/" + REF_NAME + "/{sample}.merged.rmdup.merged.realn.mapped_q30.subs_dp{DP}.noCpG_ref.repma.genome.mlRho.txt", sample=MODERN_SUBSAMPLED_CpG_SAMPLES, DP=config["subsampling_depth"],) outlist += 
(not_subsampled_CpG_mlRho + subsampled_CpG_mlRho) elif config["CpG_from_vcf_and_reference"] == True: - not_subsampled_CpG_mlRho = expand("results/modern/mlRho/" + REF_NAME + "/{sample}.merged.rmdup.merged.realn.noCpG_vcfref.repma.all.mlRho.txt", + not_subsampled_CpG_mlRho = expand("results/modern/mlRho/" + REF_NAME + "/{sample}.merged.rmdup.merged.realn.noCpG_vcfref.repma.genome.mlRho.txt", sample=MODERN_NOT_SUBSAMPLED_CpG_SAMPLES,) - subsampled_CpG_mlRho = expand("results/modern/mlRho/" + REF_NAME + "/{sample}.merged.rmdup.merged.realn.mapped_q30.subs_dp{DP}.noCpG_vcfref.repma.all.mlRho.txt", + subsampled_CpG_mlRho = expand("results/modern/mlRho/" + REF_NAME + "/{sample}.merged.rmdup.merged.realn.mapped_q30.subs_dp{DP}.noCpG_vcfref.repma.genome.mlRho.txt", sample=MODERN_SUBSAMPLED_CpG_SAMPLES, DP=config["subsampling_depth"],) outlist += (not_subsampled_CpG_mlRho + subsampled_CpG_mlRho) @@ -481,17 +481,17 @@ rule mlRho_sexchr: """ -rule bam2pro_all: +rule bam2pro_genome: """Generate pro files from bam files""" """Note that the depth filter is recalculated for subsampled bam files, according to the target depth for subsampling""" input: bam=bam_file_mlRho, dp=depth_file_mlRho, - bed=bed_file_all_mlRho, + bed=bed_file_genome_mlRho, output: - pro=temp("results/{dataset}/mlRho/" + REF_NAME + "/{sample}.merged.rmdup.merged.{processed}.all.pro"), + pro=temp("results/{dataset}/mlRho/" + REF_NAME + "/{sample}.merged.rmdup.merged.{processed}.genome.pro"), log: - "results/logs/7_mlRho/{dataset}/" + REF_NAME + "/{sample}.{processed}_bam2pro_all.log", + "results/logs/7_mlRho/{dataset}/" + REF_NAME + "/{sample}.{processed}_bam2pro_genome.log", singularity: "docker://nbisweden/generode-mlrho" shell: @@ -510,22 +510,22 @@ rule bam2pro_all: """ -rule mlRho_all: +rule mlRho_genome: """Format the pro file and run mlRho""" """Note that the depth filter is recalculated for subsampled bam files, according to the target depth for subsampling""" input: - pro=rules.bam2pro_all.output, + 
pro=rules.bam2pro_genome.output, dp=depth_file_mlRho, output: - mlRho="results/{dataset}/mlRho/" + REF_NAME + "/{sample}.merged.rmdup.merged.{processed}.all.mlRho.txt", - con=temp("results/{dataset}/mlRho/" + REF_NAME + "/{sample}.merged.rmdup.merged.{processed}.all_profileDb.con"), - lik=temp("results/{dataset}/mlRho/" + REF_NAME + "/{sample}.merged.rmdup.merged.{processed}.all_profileDb.lik"), - pos=temp("results/{dataset}/mlRho/" + REF_NAME + "/{sample}.merged.rmdup.merged.{processed}.all_profileDb.pos"), - sum=temp("results/{dataset}/mlRho/" + REF_NAME + "/{sample}.merged.rmdup.merged.{processed}.all_profileDb.sum"), + mlRho="results/{dataset}/mlRho/" + REF_NAME + "/{sample}.merged.rmdup.merged.{processed}.genome.mlRho.txt", + con=temp("results/{dataset}/mlRho/" + REF_NAME + "/{sample}.merged.rmdup.merged.{processed}.genome_profileDb.con"), + lik=temp("results/{dataset}/mlRho/" + REF_NAME + "/{sample}.merged.rmdup.merged.{processed}.genome_profileDb.lik"), + pos=temp("results/{dataset}/mlRho/" + REF_NAME + "/{sample}.merged.rmdup.merged.{processed}.genome_profileDb.pos"), + sum=temp("results/{dataset}/mlRho/" + REF_NAME + "/{sample}.merged.rmdup.merged.{processed}.genome_profileDb.sum"), params: - db="results/{dataset}/mlRho/" + REF_NAME + "/{sample}.merged.rmdup.merged.{processed}.all_profileDb", + db="results/{dataset}/mlRho/" + REF_NAME + "/{sample}.merged.rmdup.merged.{processed}.genome_profileDb", log: - "results/logs/7_mlRho/{dataset}/" + REF_NAME + "/{sample}.{processed}_mlRho_all.log", + "results/logs/7_mlRho/{dataset}/" + REF_NAME + "/{sample}.{processed}_mlRho_genome.log", singularity: "docker://nbisweden/generode-mlrho" shell: From 228d30ee5ff10bf8acbf6fb32f0fb5836fa200c4 Mon Sep 17 00:00:00 2001 From: verku Date: Thu, 5 Oct 2023 13:59:24 +0200 Subject: [PATCH 12/49] Add genome for wildcard chr and constrain fmiss wildcard so that the BCF file can either be filtered for sex chromosomes (autos) or not (genome) --- workflow/rules/9_merge_vcfs.smk | 46 
++++++++++++++++----------------- workflow/rules/common.smk | 3 ++- 2 files changed, 25 insertions(+), 24 deletions(-) diff --git a/workflow/rules/9_merge_vcfs.smk b/workflow/rules/9_merge_vcfs.smk index 9dc4096..62db515 100644 --- a/workflow/rules/9_merge_vcfs.smk +++ b/workflow/rules/9_merge_vcfs.smk @@ -164,14 +164,14 @@ def missingness_filtered_vcf_multiqc_inputs(wildcards): fmiss=config["f_missing"],) elif len(sexchromosomeList) == 0: if os.path.exists(config["historical_samples"]) and os.path.exists(config["modern_samples"]): - return expand("results/{dataset}/vcf/" + REF_NAME + "/stats/vcf_merged_missing/" + REF_NAME + ".{dataset}.merged.biallelic.fmissing{fmiss}.vcf.stats.txt", + return expand("results/{dataset}/vcf/" + REF_NAME + "/stats/vcf_merged_missing/" + REF_NAME + ".{dataset}.merged.biallelic.fmissing{fmiss}.genome.vcf.stats.txt", dataset=["all", "historical", "modern"], fmiss=config["f_missing"],) elif os.path.exists(config["historical_samples"]): - return expand("results/historical/vcf/" + REF_NAME + "/stats/vcf_merged_missing/" + REF_NAME + ".historical.merged.biallelic.fmissing{fmiss}.vcf.stats.txt", + return expand("results/historical/vcf/" + REF_NAME + "/stats/vcf_merged_missing/" + REF_NAME + ".historical.merged.biallelic.fmissing{fmiss}.genome.vcf.stats.txt", fmiss=config["f_missing"],) elif os.path.exists(config["modern_samples"]): - return expand("results/modern/vcf/" + REF_NAME + "/stats/vcf_merged_missing/" + REF_NAME + ".modern.merged.biallelic.fmissing{fmiss}.vcf.stats.txt", + return expand("results/modern/vcf/" + REF_NAME + "/stats/vcf_merged_missing/" + REF_NAME + ".modern.merged.biallelic.fmissing{fmiss}.genome.vcf.stats.txt", fmiss=config["f_missing"],) @@ -326,8 +326,8 @@ rule filter_vcf_missing: index=rules.filter_vcf_biallelic.output.index, multiqc=rules.biallelic_filtered_vcf_multiqc.output.stats, output: - vcf="results/all/vcf/" + REF_NAME + ".all.merged.biallelic.fmissing{fmiss}.vcf.gz", - index="results/all/vcf/" + REF_NAME 
+ ".all.merged.biallelic.fmissing{fmiss}.vcf.gz.csi", + vcf="results/all/vcf/" + REF_NAME + ".all.merged.biallelic.fmissing{fmiss}.genome.vcf.gz", + index="results/all/vcf/" + REF_NAME + ".all.merged.biallelic.fmissing{fmiss}.genome.vcf.gz.csi", threads: 2 params: fmiss=config["f_missing"], @@ -382,12 +382,12 @@ rule remove_chromosomes: rule filtered_vcf2bed: """Convert the VCF file after removal of missing data (and optionally sex chromosomes) to BED file containing the remaining sites""" input: - vcf="results/all/vcf/" + REF_NAME + ".all.merged.biallelic.fmissing{fmiss}{autos}.vcf.gz", - index="results/all/vcf/" + REF_NAME + ".all.merged.biallelic.fmissing{fmiss}{autos}.vcf.gz.csi", + vcf="results/all/vcf/" + REF_NAME + ".all.merged.biallelic.fmissing{fmiss}.{chr}.vcf.gz", + index="results/all/vcf/" + REF_NAME + ".all.merged.biallelic.fmissing{fmiss}.{chr}.vcf.gz.csi", output: - bed="results/all/vcf/" + REF_NAME + ".all.merged.biallelic.fmissing{fmiss}{autos}.bed", + bed="results/all/vcf/" + REF_NAME + ".all.merged.biallelic.fmissing{fmiss}.{chr}.bed", log: - "results/logs/9_merge_vcfs/" + REF_NAME + ".all_fmissing{fmiss}{autos}_filtered_vcf2bed.log", + "results/logs/9_merge_vcfs/" + REF_NAME + ".all_fmissing{fmiss}.{chr}_filtered_vcf2bed.log", singularity: "docker://nbisweden/generode-bedtools-2.29.2" shell: @@ -398,14 +398,14 @@ rule filtered_vcf2bed: rule extract_historical_samples: input: - vcf="results/all/vcf/" + REF_NAME + ".all.merged.biallelic.fmissing{fmiss}{autos}.vcf.gz", - index="results/all/vcf/" + REF_NAME + ".all.merged.biallelic.fmissing{fmiss}{autos}.vcf.gz.csi", + vcf="results/all/vcf/" + REF_NAME + ".all.merged.biallelic.fmissing{fmiss}.{chr}.vcf.gz", + index="results/all/vcf/" + REF_NAME + ".all.merged.biallelic.fmissing{fmiss}.{chr}.vcf.gz.csi", bed=rules.filtered_vcf2bed.output.bed, output: - vcf="results/historical/vcf/" + REF_NAME + ".historical.merged.biallelic.fmissing{fmiss}{autos}.vcf.gz", - index="results/historical/vcf/" + REF_NAME 
+ ".historical.merged.biallelic.fmissing{fmiss}{autos}.vcf.gz.csi", + vcf="results/historical/vcf/" + REF_NAME + ".historical.merged.biallelic.fmissing{fmiss}.{chr}.vcf.gz", + index="results/historical/vcf/" + REF_NAME + ".historical.merged.biallelic.fmissing{fmiss}.{chr}.vcf.gz.csi", log: - "results/logs/9_merge_vcfs/" + REF_NAME + ".historical_fmissing{fmiss}{autos}_extract_historical_samples.log", + "results/logs/9_merge_vcfs/" + REF_NAME + ".historical_fmissing{fmiss}.{chr}_extract_historical_samples.log", singularity: "docker://quay.io/biocontainers/bcftools:1.9--h68d8f2e_9" params: @@ -431,14 +431,14 @@ rule extract_historical_samples: rule extract_modern_samples: input: - vcf="results/all/vcf/" + REF_NAME + ".all.merged.biallelic.fmissing{fmiss}{autos}.vcf.gz", - index="results/all/vcf/" + REF_NAME + ".all.merged.biallelic.fmissing{fmiss}{autos}.vcf.gz.csi", + vcf="results/all/vcf/" + REF_NAME + ".all.merged.biallelic.fmissing{fmiss}.{chr}.vcf.gz", + index="results/all/vcf/" + REF_NAME + ".all.merged.biallelic.fmissing{fmiss}.{chr}.vcf.gz.csi", bed=rules.filtered_vcf2bed.output.bed, output: - vcf="results/modern/vcf/" + REF_NAME + ".modern.merged.biallelic.fmissing{fmiss}{autos}.vcf.gz", - index="results/modern/vcf/" + REF_NAME + ".modern.merged.biallelic.fmissing{fmiss}{autos}.vcf.gz.csi", + vcf="results/modern/vcf/" + REF_NAME + ".modern.merged.biallelic.fmissing{fmiss}.{chr}.vcf.gz", + index="results/modern/vcf/" + REF_NAME + ".modern.merged.biallelic.fmissing{fmiss}.{chr}.vcf.gz.csi", log: - "results/logs/9_merge_vcfs/" + REF_NAME + ".modern_fmissing{fmiss}{autos}_extract_modern_samples.log", + "results/logs/9_merge_vcfs/" + REF_NAME + ".modern_fmissing{fmiss}.{chr}_extract_modern_samples.log", singularity: "docker://quay.io/biocontainers/bcftools:1.9--h68d8f2e_9" params: @@ -465,12 +465,12 @@ rule extract_modern_samples: rule missingness_filtered_vcf_stats: """Obtain summary stats of merged vcf file""" input: - merged="results/{dataset}/vcf/" + REF_NAME 
+ ".{dataset}.merged.biallelic.fmissing{fmiss}{autos}.vcf.gz", - index="results/{dataset}/vcf/" + REF_NAME + ".{dataset}.merged.biallelic.fmissing{fmiss}{autos}.vcf.gz.csi", + merged="results/{dataset}/vcf/" + REF_NAME + ".{dataset}.merged.biallelic.fmissing{fmiss}.{chr}.vcf.gz", + index="results/{dataset}/vcf/" + REF_NAME + ".{dataset}.merged.biallelic.fmissing{fmiss}.{chr}.vcf.gz.csi", output: - stats="results/{dataset}/vcf/" + REF_NAME + "/stats/vcf_merged_missing/" + REF_NAME + ".{dataset}.merged.biallelic.fmissing{fmiss}{autos}.vcf.stats.txt", + stats="results/{dataset}/vcf/" + REF_NAME + "/stats/vcf_merged_missing/" + REF_NAME + ".{dataset}.merged.biallelic.fmissing{fmiss}.{chr}.vcf.stats.txt", log: - "results/logs/9_merge_vcfs/" + REF_NAME + ".{dataset}_fmissing{fmiss}{autos}_missingness_filtered_vcf_stats.log", + "results/logs/9_merge_vcfs/" + REF_NAME + ".{dataset}_fmissing{fmiss}.{chr}_missingness_filtered_vcf_stats.log", singularity: "docker://quay.io/biocontainers/bcftools:1.9--h68d8f2e_9" shell: diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index c8954a9..908b859 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -25,7 +25,8 @@ REF_NAME, REF_EXT = os.path.splitext(REF_FASTA) ### Global wildcard contraints that apply to all rules wildcard_constraints: sample="[A-Za-z0-9]+", - DP="[+-]?(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?", # avoid extension with "rm.autosomes" when running mlRho + DP="[+-]?(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?", # avoid extension with "rm.autos" when running mlRho + fmiss="[0-1].[0-9]+", # avoid extension with ".autos" when filtering the merged BCF file CpG_method="CpG_[vcfre]{3,6}", chunk="chunk[0-9]+", From f48cb493c24e0291d281ff3ead3dee60577b0cbd Mon Sep 17 00:00:00 2001 From: verku Date: Thu, 5 Oct 2023 17:11:00 +0200 Subject: [PATCH 13/49] Implement running all downstream analyses with full genome-wide BCF files or with autosomal-only BCF files --- workflow/rules/10_pca.smk | 62 +++++---- 
workflow/rules/11_ROH.smk | 68 +++++----- workflow/rules/12_snpEff.smk | 157 +++++++++++++--------- workflow/rules/13_GERP.smk | 253 +++++++++++++++++++++-------------- workflow/rules/common.smk | 5 + 5 files changed, 321 insertions(+), 224 deletions(-) diff --git a/workflow/rules/10_pca.smk b/workflow/rules/10_pca.smk index 0e5e013..3bd5193 100644 --- a/workflow/rules/10_pca.smk +++ b/workflow/rules/10_pca.smk @@ -3,22 +3,28 @@ # Code collecting output files from this part of the pipeline if os.path.exists(config["historical_samples"]): - all_outputs.append(expand("results/historical/pca/" + REF_NAME + ".historical.merged.biallelic.fmissing{fmiss}.pc1_pc2.pdf", - fmiss=config["f_missing"],)) - all_outputs.append(expand("results/historical/pca/" + REF_NAME + ".historical.merged.biallelic.fmissing{fmiss}.pc1_pc3.pdf", - fmiss=config["f_missing"],)) + all_outputs.append(expand("results/historical/pca/" + REF_NAME + ".historical.merged.biallelic.fmissing{fmiss}.{chr}.pc1_pc2.pdf", + fmiss=config["f_missing"], + chr=CHR,)) + all_outputs.append(expand("results/historical/pca/" + REF_NAME + ".historical.merged.biallelic.fmissing{fmiss}.{chr}.pc1_pc3.pdf", + fmiss=config["f_missing"], + chr=CHR,)) if os.path.exists(config["modern_samples"]): - all_outputs.append(expand("results/modern/pca/" + REF_NAME + ".modern.merged.biallelic.fmissing{fmiss}.pc1_pc2.pdf", - fmiss=config["f_missing"],)) - all_outputs.append(expand("results/modern/pca/" + REF_NAME + ".modern.merged.biallelic.fmissing{fmiss}.pc1_pc3.pdf", - fmiss=config["f_missing"],)) + all_outputs.append(expand("results/modern/pca/" + REF_NAME + ".modern.merged.biallelic.fmissing{fmiss}.{chr}.pc1_pc2.pdf", + fmiss=config["f_missing"], + chr=CHR,)) + all_outputs.append(expand("results/modern/pca/" + REF_NAME + ".modern.merged.biallelic.fmissing{fmiss}.{chr}.pc1_pc3.pdf", + fmiss=config["f_missing"], + chr=CHR,)) if os.path.exists(config["historical_samples"]) and os.path.exists(config["modern_samples"]): - 
all_outputs.append(expand("results/all/pca/" + REF_NAME + ".all.merged.biallelic.fmissing{fmiss}.pc1_pc2.pdf", - fmiss=config["f_missing"],)) - all_outputs.append(expand("results/all/pca/" + REF_NAME + ".all.merged.biallelic.fmissing{fmiss}.pc1_pc3.pdf", - fmiss=config["f_missing"],)) + all_outputs.append(expand("results/all/pca/" + REF_NAME + ".all.merged.biallelic.fmissing{fmiss}.{chr}.pc1_pc2.pdf", + fmiss=config["f_missing"], + chr=CHR,)) + all_outputs.append(expand("results/all/pca/" + REF_NAME + ".all.merged.biallelic.fmissing{fmiss}.{chr}.pc1_pc3.pdf", + fmiss=config["f_missing"], + chr=CHR,)) # snakemake rules @@ -30,18 +36,18 @@ localrules: rule vcf2plink_pca: """Convert VCF files to plink format (version 1.9) for PCA""" input: - vcf="results/{dataset}/vcf/" + REF_NAME + ".{dataset}.merged.biallelic.fmissing{fmiss}.vcf.gz", - index="results/{dataset}/vcf/" + REF_NAME + ".{dataset}.merged.biallelic.fmissing{fmiss}.vcf.gz.csi", + vcf="results/{dataset}/vcf/" + REF_NAME + ".{dataset}.merged.biallelic.fmissing{fmiss}.{chr}.vcf.gz", + index="results/{dataset}/vcf/" + REF_NAME + ".{dataset}.merged.biallelic.fmissing{fmiss}.{chr}.vcf.gz.csi", output: - bed=temp("results/{dataset}/pca/" + REF_NAME + ".{dataset}.merged.biallelic.fmissing{fmiss}.bed"), - bim=temp("results/{dataset}/pca/" + REF_NAME + ".{dataset}.merged.biallelic.fmissing{fmiss}.bim"), - fam=temp("results/{dataset}/pca/" + REF_NAME + ".{dataset}.merged.biallelic.fmissing{fmiss}.fam"), - nosex=temp("results/{dataset}/pca/" + REF_NAME + ".{dataset}.merged.biallelic.fmissing{fmiss}.nosex"), + bed=temp("results/{dataset}/pca/" + REF_NAME + ".{dataset}.merged.biallelic.fmissing{fmiss}.{chr}.bed"), + bim=temp("results/{dataset}/pca/" + REF_NAME + ".{dataset}.merged.biallelic.fmissing{fmiss}.{chr}.bim"), + fam=temp("results/{dataset}/pca/" + REF_NAME + ".{dataset}.merged.biallelic.fmissing{fmiss}.{chr}.fam"), + nosex=temp("results/{dataset}/pca/" + REF_NAME + 
".{dataset}.merged.biallelic.fmissing{fmiss}.{chr}.nosex"), threads: 2 params: - bfile="results/{dataset}/pca/" + REF_NAME + ".{dataset}.merged.biallelic.fmissing{fmiss}", + bfile="results/{dataset}/pca/" + REF_NAME + ".{dataset}.merged.biallelic.fmissing{fmiss}.{chr}", log: - "results/logs/10_pca/{dataset}/" + REF_NAME + ".{dataset}_fmissing{fmiss}_vcf2plink_pca.log", + "results/logs/10_pca/{dataset}/" + REF_NAME + ".{dataset}_fmissing{fmiss}.{chr}_vcf2plink_pca.log", singularity: "docker://quay.io/biocontainers/plink:1.90b6.12--heea4ae3_0" shell: @@ -58,12 +64,12 @@ rule plink_eigenvec: fam=rules.vcf2plink_pca.output.fam, nosex=rules.vcf2plink_pca.output.nosex, output: - eigenvec="results/{dataset}/pca/" + REF_NAME + ".{dataset}.merged.biallelic.fmissing{fmiss}.eigenvec", - eigenval="results/{dataset}/pca/" + REF_NAME + ".{dataset}.merged.biallelic.fmissing{fmiss}.eigenval", + eigenvec="results/{dataset}/pca/" + REF_NAME + ".{dataset}.merged.biallelic.fmissing{fmiss}.{chr}.eigenvec", + eigenval="results/{dataset}/pca/" + REF_NAME + ".{dataset}.merged.biallelic.fmissing{fmiss}.{chr}.eigenval", params: - bfile="results/{dataset}/pca/" + REF_NAME + ".{dataset}.merged.biallelic.fmissing{fmiss}", + bfile="results/{dataset}/pca/" + REF_NAME + ".{dataset}.merged.biallelic.fmissing{fmiss}.{chr}", log: - "results/logs/10_pca/{dataset}/" + REF_NAME + ".{dataset}_fmissing{fmiss}_plink_eigenvec.log", + "results/logs/10_pca/{dataset}/" + REF_NAME + ".{dataset}_fmissing{fmiss}.{chr}_plink_eigenvec.log", singularity: "docker://quay.io/biocontainers/plink:1.90b6.12--heea4ae3_0" shell: @@ -85,11 +91,11 @@ rule plot_pc1_pc2: eigenvec=rules.plink_eigenvec.output.eigenvec, eigenval=rules.plink_eigenvec.output.eigenval, output: - pdf=report("results/{dataset}/pca/" + REF_NAME + ".{dataset}.merged.biallelic.fmissing{fmiss}.pc1_pc2.pdf", + pdf=report("results/{dataset}/pca/" + REF_NAME + ".{dataset}.merged.biallelic.fmissing{fmiss}.{chr}.pc1_pc2.pdf", 
caption="../report/pca_plot_pc1_pc2.rst", category="PCA",), log: - "results/logs/10_pca/{dataset}/" + REF_NAME + ".{dataset}_fmissing{fmiss}_plot_pc1_pc2.log", + "results/logs/10_pca/{dataset}/" + REF_NAME + ".{dataset}_fmissing{fmiss}.{chr}_plot_pc1_pc2.log", script: "../scripts/pc1_vs_pc2_plot.py" @@ -100,10 +106,10 @@ rule plot_pc1_pc3: eigenvec=rules.plink_eigenvec.output.eigenvec, eigenval=rules.plink_eigenvec.output.eigenval, output: - pdf=report("results/{dataset}/pca/" + REF_NAME + ".{dataset}.merged.biallelic.fmissing{fmiss}.pc1_pc3.pdf", + pdf=report("results/{dataset}/pca/" + REF_NAME + ".{dataset}.merged.biallelic.fmissing{fmiss}.{chr}.pc1_pc3.pdf", caption="../report/pca_plot_pc1_pc3.rst", category="PCA",), log: - "results/logs/10_pca/{dataset}/" + REF_NAME + ".{dataset}_fmissing{fmiss}_plot_pc1_pc3.log", + "results/logs/10_pca/{dataset}/" + REF_NAME + ".{dataset}_fmissing{fmiss}.{chr}_plot_pc1_pc3.log", script: "../scripts/pc1_vs_pc3_plot.py" diff --git a/workflow/rules/11_ROH.smk b/workflow/rules/11_ROH.smk index d0ebcd7..a32f6aa 100644 --- a/workflow/rules/11_ROH.smk +++ b/workflow/rules/11_ROH.smk @@ -3,8 +3,9 @@ # Code collecting output files from this part of the pipeline if os.path.exists(config["historical_samples"]) and os.path.exists(config["modern_samples"]): - all_outputs.append(expand("results/all/ROH/" + REF_NAME + ".all.merged.biallelic.fmissing{fmiss}.hwe0.05.homsnp{homsnp}.homkb{homkb}.homwinsnp{homwinsnp}.homwinhet{homwinhet}.homwinmis{homwinmis}.homhet{homhet}.FROH_min_2Mb_plot.pdf", + all_outputs.append(expand("results/all/ROH/" + REF_NAME + ".all.merged.biallelic.fmissing{fmiss}.{chr}.hwe0.05.homsnp{homsnp}.homkb{homkb}.homwinsnp{homwinsnp}.homwinhet{homwinhet}.homwinmis{homwinmis}.homhet{homhet}.FROH_min_2Mb_plot.pdf", fmiss=config["f_missing"], + chr=CHR, homsnp=config["homozyg-snp"], homkb=config["homozyg-kb"], homwinsnp=config["homozyg-window-snp"], @@ -13,8 +14,9 @@ if os.path.exists(config["historical_samples"]) and 
os.path.exists(config["moder homhet=config["homozyg-het"],)) elif os.path.exists(config["historical_samples"]): - all_outputs.append(expand("results/historical/ROH/" + REF_NAME + ".historical.merged.biallelic.fmissing{fmiss}.hwe0.05.homsnp{homsnp}.homkb{homkb}.homwinsnp{homwinsnp}.homwinhet{homwinhet}.homwinmis{homwinmis}.homhet{homhet}.FROH_min_2Mb_plot.pdf", + all_outputs.append(expand("results/historical/ROH/" + REF_NAME + ".historical.merged.biallelic.fmissing{fmiss}.{chr}.hwe0.05.homsnp{homsnp}.homkb{homkb}.homwinsnp{homwinsnp}.homwinhet{homwinhet}.homwinmis{homwinmis}.homhet{homhet}.FROH_min_2Mb_plot.pdf", fmiss=config["f_missing"], + chr=CHR, homsnp=config["homozyg-snp"], homkb=config["homozyg-kb"], homwinsnp=config["homozyg-window-snp"], @@ -23,8 +25,9 @@ elif os.path.exists(config["historical_samples"]): homhet=config["homozyg-het"],)) elif os.path.exists(config["modern_samples"]): - all_outputs.append(expand("results/modern/ROH/" + REF_NAME + ".modern.merged.biallelic.fmissing{fmiss}.hwe0.05.homsnp{homsnp}.homkb{homkb}.homwinsnp{homwinsnp}.homwinhet{homwinhet}.homwinmis{homwinmis}.homhet{homhet}.FROH_min_2Mb_plot.pdf", + all_outputs.append(expand("results/modern/ROH/" + REF_NAME + ".modern.merged.biallelic.fmissing{fmiss}.{chr}.hwe0.05.homsnp{homsnp}.homkb{homkb}.homwinsnp{homwinsnp}.homwinhet{homwinhet}.homwinmis{homwinmis}.homhet{homhet}.FROH_min_2Mb_plot.pdf", fmiss=config["f_missing"], + chr=CHR, homsnp=config["homozyg-snp"], homkb=config["homozyg-kb"], homwinsnp=config["homozyg-window-snp"], @@ -43,15 +46,15 @@ rule filter_vcf_hwe: Filter the VCF files for ROH analysis """ input: - vcf="results/{dataset}/vcf/" + REF_NAME + ".{dataset}.merged.biallelic.fmissing{fmiss}.vcf.gz", - index="results/{dataset}/vcf/" + REF_NAME + ".{dataset}.merged.biallelic.fmissing{fmiss}.vcf.gz.csi", + vcf="results/{dataset}/vcf/" + REF_NAME + ".{dataset}.merged.biallelic.fmissing{fmiss}.{chr}.vcf.gz", + index="results/{dataset}/vcf/" + REF_NAME + 
".{dataset}.merged.biallelic.fmissing{fmiss}.{chr}.vcf.gz.csi", output: - vcf=temp("results/{dataset}/ROH/" + REF_NAME + ".{dataset}.merged.biallelic.fmissing{fmiss}.hwe0.05.recode.vcf"), + vcf=temp("results/{dataset}/ROH/" + REF_NAME + ".{dataset}.merged.biallelic.fmissing{fmiss}.{chr}.hwe0.05.recode.vcf"), threads: 2 params: - out="results/{dataset}/ROH/" + REF_NAME + ".{dataset}.merged.biallelic.fmissing{fmiss}.hwe0.05", + out="results/{dataset}/ROH/" + REF_NAME + ".{dataset}.merged.biallelic.fmissing{fmiss}.{chr}.hwe0.05", log: - "results/logs/11_ROH/{dataset}/" + REF_NAME + ".{dataset}_fmissing{fmiss}_filter_vcf_hwe.log", + "results/logs/11_ROH/{dataset}/" + REF_NAME + ".{dataset}_fmissing{fmiss}.{chr}_filter_vcf_hwe.log", singularity: "docker://biocontainers/vcftools:v0.1.16-1-deb_cv1" shell: @@ -64,10 +67,10 @@ rule compress_roh_vcf: input: vcf=rules.filter_vcf_hwe.output.vcf, output: - compressed=temp("results/{dataset}/ROH/" + REF_NAME + ".{dataset}.merged.biallelic.fmissing{fmiss}.hwe0.05.recode.vcf.gz"), - index=temp("results/{dataset}/ROH/" + REF_NAME + ".{dataset}.merged.biallelic.fmissing{fmiss}.hwe0.05.recode.vcf.gz.tbi"), + compressed=temp("results/{dataset}/ROH/" + REF_NAME + ".{dataset}.merged.biallelic.fmissing{fmiss}.{chr}.hwe0.05.recode.vcf.gz"), + index=temp("results/{dataset}/ROH/" + REF_NAME + ".{dataset}.merged.biallelic.fmissing{fmiss}.{chr}.hwe0.05.recode.vcf.gz.tbi"), log: - "results/logs/11_ROH/{dataset}/" + REF_NAME + ".{dataset}_fmissing{fmiss}_compress_roh_vcf.log", + "results/logs/11_ROH/{dataset}/" + REF_NAME + ".{dataset}_fmissing{fmiss}.{chr}_compress_roh_vcf.log", singularity: "docker://quay.io/biocontainers/bcftools:1.9--h68d8f2e_9" shell: @@ -85,15 +88,15 @@ rule vcf2plink_hwe: vcf=rules.compress_roh_vcf.output.compressed, index=rules.compress_roh_vcf.output.index, output: - bed="results/{dataset}/ROH/" + REF_NAME + ".{dataset}.merged.biallelic.fmissing{fmiss}.hwe0.05.bed", - bim="results/{dataset}/ROH/" + REF_NAME + 
".{dataset}.merged.biallelic.fmissing{fmiss}.hwe0.05.bim", - fam="results/{dataset}/ROH/" + REF_NAME + ".{dataset}.merged.biallelic.fmissing{fmiss}.hwe0.05.fam", - nosex="results/{dataset}/ROH/" + REF_NAME + ".{dataset}.merged.biallelic.fmissing{fmiss}.hwe0.05.nosex", + bed="results/{dataset}/ROH/" + REF_NAME + ".{dataset}.merged.biallelic.fmissing{fmiss}.{chr}.hwe0.05.bed", + bim="results/{dataset}/ROH/" + REF_NAME + ".{dataset}.merged.biallelic.fmissing{fmiss}.{chr}.hwe0.05.bim", + fam="results/{dataset}/ROH/" + REF_NAME + ".{dataset}.merged.biallelic.fmissing{fmiss}.{chr}.hwe0.05.fam", + nosex="results/{dataset}/ROH/" + REF_NAME + ".{dataset}.merged.biallelic.fmissing{fmiss}.{chr}.hwe0.05.nosex", threads: 2 params: - bfile="results/{dataset}/ROH/" + REF_NAME + ".{dataset}.merged.biallelic.fmissing{fmiss}.hwe0.05", + bfile="results/{dataset}/ROH/" + REF_NAME + ".{dataset}.merged.biallelic.fmissing{fmiss}.{chr}.hwe0.05", log: - "results/logs/11_ROH/{dataset}/" + REF_NAME + ".{dataset}_fmissing{fmiss}_vcf2plink_hwe.log", + "results/logs/11_ROH/{dataset}/" + REF_NAME + ".{dataset}_fmissing{fmiss}.{chr}_vcf2plink_hwe.log", singularity: "docker://quay.io/biocontainers/plink:1.90b6.12--heea4ae3_0" shell: @@ -109,20 +112,20 @@ rule ROHs: input: rules.vcf2plink_hwe.output, output: - roh="results/{dataset}/ROH/" + REF_NAME + ".{dataset}.merged.biallelic.fmissing{fmiss}.hwe0.05.homsnp{homsnp}.homkb{homkb}.homwinsnp{homwinsnp}.homwinhet{homwinhet}.homwinmis{homwinmis}.homhet{homhet}.hom", - indiv="results/{dataset}/ROH/" + REF_NAME + ".{dataset}.merged.biallelic.fmissing{fmiss}.hwe0.05.homsnp{homsnp}.homkb{homkb}.homwinsnp{homwinsnp}.homwinhet{homwinhet}.homwinmis{homwinmis}.homhet{homhet}.hom.indiv", - summary="results/{dataset}/ROH/" + REF_NAME + ".{dataset}.merged.biallelic.fmissing{fmiss}.hwe0.05.homsnp{homsnp}.homkb{homkb}.homwinsnp{homwinsnp}.homwinhet{homwinhet}.homwinmis{homwinmis}.homhet{homhet}.hom.summary", + roh="results/{dataset}/ROH/" + REF_NAME + 
".{dataset}.merged.biallelic.fmissing{fmiss}.{chr}.hwe0.05.homsnp{homsnp}.homkb{homkb}.homwinsnp{homwinsnp}.homwinhet{homwinhet}.homwinmis{homwinmis}.homhet{homhet}.hom", + indiv="results/{dataset}/ROH/" + REF_NAME + ".{dataset}.merged.biallelic.fmissing{fmiss}.{chr}.hwe0.05.homsnp{homsnp}.homkb{homkb}.homwinsnp{homwinsnp}.homwinhet{homwinhet}.homwinmis{homwinmis}.homhet{homhet}.hom.indiv", + summary="results/{dataset}/ROH/" + REF_NAME + ".{dataset}.merged.biallelic.fmissing{fmiss}.{chr}.hwe0.05.homsnp{homsnp}.homkb{homkb}.homwinsnp{homwinsnp}.homwinhet{homwinhet}.homwinmis{homwinmis}.homhet{homhet}.hom.summary", params: - bfile="results/{dataset}/ROH/" + REF_NAME + ".{dataset}.merged.biallelic.fmissing{fmiss}.hwe0.05", + bfile="results/{dataset}/ROH/" + REF_NAME + ".{dataset}.merged.biallelic.fmissing{fmiss}.{chr}.hwe0.05", homsnp=config["homozyg-snp"], # Min SNP count per ROH. homkb=config["homozyg-kb"], # Min length of ROH, with min SNP count. homwinsnp=config["homozyg-window-snp"], # Scanning window size. homwinhet=config["homozyg-window-het"], # Max hets in scanning window hit. homwinmis=config["homozyg-window-missing"], # Max missing calls in scanning window hit. homhet=config["homozyg-het"], # By default, a ROH can contain an unlimited number of heterozygous calls; you can impose a limit with --homozyg-het. (This flag was silently ignored by PLINK 1.07.) 
- roh="results/{dataset}/ROH/" + REF_NAME + ".{dataset}.merged.biallelic.fmissing{fmiss}.hwe0.05.homsnp{homsnp}.homkb{homkb}.homwinsnp{homwinsnp}.homwinhet{homwinhet}.homwinmis{homwinmis}.homhet{homhet}", + roh="results/{dataset}/ROH/" + REF_NAME + ".{dataset}.merged.biallelic.fmissing{fmiss}.{chr}.hwe0.05.homsnp{homsnp}.homkb{homkb}.homwinsnp{homwinsnp}.homwinhet{homwinhet}.homwinmis{homwinmis}.homhet{homhet}", log: - "results/logs/11_ROH/{dataset}/" + REF_NAME + ".{dataset}_fmissing{fmiss}.homsnp{homsnp}.homkb{homkb}.homwinsnp{homwinsnp}.homwinhet{homwinhet}.homwinmis{homwinmis}.homhet{homhet}_ROHs.log", + "results/logs/11_ROH/{dataset}/" + REF_NAME + ".{dataset}_fmissing{fmiss}.{chr}.homsnp{homsnp}.homkb{homkb}.homwinsnp{homwinsnp}.homwinhet{homwinhet}.homwinmis{homwinmis}.homhet{homhet}_ROHs.log", singularity: "docker://quay.io/biocontainers/plink:1.90b6.12--heea4ae3_0" shell: @@ -137,9 +140,10 @@ rule ROHs: all_ROH_outputs = [] if os.path.exists(config["historical_samples"]) and os.path.exists(config["modern_samples"]): - both_ROH_outputs = expand("results/{dataset}/ROH/" + REF_NAME + ".{dataset}.merged.biallelic.fmissing{fmiss}.hwe0.05.homsnp{homsnp}.homkb{homkb}.homwinsnp{homwinsnp}.homwinhet{homwinhet}.homwinmis{homwinmis}.homhet{homhet}.{filetype}", + both_ROH_outputs = expand("results/{dataset}/ROH/" + REF_NAME + ".{dataset}.merged.biallelic.fmissing{fmiss}.{chr}.hwe0.05.homsnp{homsnp}.homkb{homkb}.homwinsnp{homwinsnp}.homwinhet{homwinhet}.homwinmis{homwinmis}.homhet{homhet}.{filetype}", dataset=["modern", "historical"], fmiss=config["f_missing"], + chr=CHR, homsnp=config["homozyg-snp"], homkb=config["homozyg-kb"], homwinsnp=config["homozyg-window-snp"], @@ -150,8 +154,9 @@ if os.path.exists(config["historical_samples"]) and os.path.exists(config["moder all_ROH_outputs.append(both_ROH_outputs) elif os.path.exists(config["historical_samples"]): - historical_ROH_outputs = expand("results/historical/ROH/" + REF_NAME + 
".historical.merged.biallelic.fmissing{fmiss}.hwe0.05.homsnp{homsnp}.homkb{homkb}.homwinsnp{homwinsnp}.homwinhet{homwinhet}.homwinmis{homwinmis}.homhet{homhet}.{filetype}", + historical_ROH_outputs = expand("results/historical/ROH/" + REF_NAME + ".historical.merged.biallelic.fmissing{fmiss}.{chr}.hwe0.05.homsnp{homsnp}.homkb{homkb}.homwinsnp{homwinsnp}.homwinhet{homwinhet}.homwinmis{homwinmis}.homhet{homhet}.{filetype}", fmiss=config["f_missing"], + chr=CHR, homsnp=config["homozyg-snp"], homkb=config["homozyg-kb"], homwinsnp=config["homozyg-window-snp"], @@ -162,8 +167,9 @@ elif os.path.exists(config["historical_samples"]): all_ROH_outputs.append(historical_ROH_outputs) elif os.path.exists(config["modern_samples"]): - modern_ROH_outputs = expand("results/modern/ROH/" + REF_NAME + ".modern.merged.biallelic.fmissing{fmiss}.hwe0.05.homsnp{homsnp}.homkb{homkb}.homwinsnp{homwinsnp}.homwinhet{homwinhet}.homwinmis{homwinmis}.homhet{homhet}.{filetype}", + modern_ROH_outputs = expand("results/modern/ROH/" + REF_NAME + ".modern.merged.biallelic.fmissing{fmiss}.{chr}.hwe0.05.homsnp{homsnp}.homkb{homkb}.homwinsnp{homwinsnp}.homwinhet{homwinhet}.homwinmis{homwinmis}.homhet{homhet}.{filetype}", fmiss=config["f_missing"], + chr=CHR, homsnp=config["homozyg-snp"], homkb=config["homozyg-kb"], homwinsnp=config["homozyg-window-snp"], @@ -182,11 +188,11 @@ rule FROH_min_2Mb_table: genomefile=REF_DIR + "/" + REF_NAME + ".genome", ROH=all_ROH_outputs, output: - table=report("results/{dataset}/ROH/" + REF_NAME + ".{dataset}.merged.biallelic.fmissing{fmiss}.hwe0.05.homsnp{homsnp}.homkb{homkb}.homwinsnp{homwinsnp}.homwinhet{homwinhet}.homwinmis{homwinmis}.homhet{homhet}.FROH_min_2Mb_table.txt", + table=report("results/{dataset}/ROH/" + REF_NAME + ".{dataset}.merged.biallelic.fmissing{fmiss}.{chr}.hwe0.05.homsnp{homsnp}.homkb{homkb}.homwinsnp{homwinsnp}.homwinhet{homwinhet}.homwinmis{homwinmis}.homhet{homhet}.FROH_min_2Mb_table.txt", caption="../report/FROH_min_2Mb_table.rst", 
category="ROH",), log: - "results/logs/11_ROH/{dataset}/" + REF_NAME + ".{dataset}.merged.biallelic.fmissing{fmiss}.hwe0.05.homsnp{homsnp}.homkb{homkb}.homwinsnp{homwinsnp}.homwinhet{homwinhet}.homwinmis{homwinmis}.homhet{homhet}.FROH_min_2Mb_table.log", + "results/logs/11_ROH/{dataset}/" + REF_NAME + ".{dataset}.merged.biallelic.fmissing{fmiss}.{chr}.hwe0.05.homsnp{homsnp}.homkb{homkb}.homwinsnp{homwinsnp}.homwinhet{homwinhet}.homwinmis{homwinmis}.homhet{homhet}.FROH_min_2Mb_table.log", script: "../scripts/FROH_min_2Mb_table.py" @@ -194,12 +200,12 @@ rule FROH_min_2Mb_table: rule FROH_min_2Mb_plot: """Plot the proportion of the genome in runs of homozygosity, for ROHs >= 2Mb""" input: - "results/{dataset}/ROH/" + REF_NAME + ".{dataset}.merged.biallelic.fmissing{fmiss}.hwe0.05.homsnp{homsnp}.homkb{homkb}.homwinsnp{homwinsnp}.homwinhet{homwinhet}.homwinmis{homwinmis}.homhet{homhet}.FROH_min_2Mb_table.txt", + "results/{dataset}/ROH/" + REF_NAME + ".{dataset}.merged.biallelic.fmissing{fmiss}.{chr}.hwe0.05.homsnp{homsnp}.homkb{homkb}.homwinsnp{homwinsnp}.homwinhet{homwinhet}.homwinmis{homwinmis}.homhet{homhet}.FROH_min_2Mb_table.txt", output: - plot=report("results/{dataset}/ROH/" + REF_NAME + ".{dataset}.merged.biallelic.fmissing{fmiss}.hwe0.05.homsnp{homsnp}.homkb{homkb}.homwinsnp{homwinsnp}.homwinhet{homwinhet}.homwinmis{homwinmis}.homhet{homhet}.FROH_min_2Mb_plot.pdf", + plot=report("results/{dataset}/ROH/" + REF_NAME + ".{dataset}.merged.biallelic.fmissing{fmiss}.{chr}.hwe0.05.homsnp{homsnp}.homkb{homkb}.homwinsnp{homwinsnp}.homwinhet{homwinhet}.homwinmis{homwinmis}.homhet{homhet}.FROH_min_2Mb_plot.pdf", caption="../report/FROH_min_2Mb_plot.rst", category="ROH",), log: - "results/logs/11_ROH/{dataset}/" + REF_NAME + ".{dataset}.merged.biallelic.fmissing{fmiss}.hwe0.05.homsnp{homsnp}.homkb{homkb}.homwinsnp{homwinsnp}.homwinhet{homwinhet}.homwinmis{homwinmis}.homhet{homhet}.FROH_min_2Mb_plot.log", + "results/logs/11_ROH/{dataset}/" + REF_NAME + 
".{dataset}.merged.biallelic.fmissing{fmiss}.{chr}.hwe0.05.homsnp{homsnp}.homkb{homkb}.homwinsnp{homwinsnp}.homwinhet{homwinhet}.homwinmis{homwinmis}.homhet{homhet}.FROH_min_2Mb_plot.log", script: "../scripts/FROH_min_2Mb_plot.py" diff --git a/workflow/rules/12_snpEff.smk b/workflow/rules/12_snpEff.smk index 0f60a96..fbfecfd 100644 --- a/workflow/rules/12_snpEff.smk +++ b/workflow/rules/12_snpEff.smk @@ -4,126 +4,153 @@ # Code collecting output files from this part of the pipeline if os.path.exists(config["historical_samples"]) and os.path.exists(config["modern_samples"]): - all_outputs.append(expand("results/all/snpEff/" + REF_NAME + ".all.fmissing{fmiss}.snpEff_variant_impact_plot.pdf", - fmiss=config["f_missing"],)) + all_outputs.append(expand("results/all/snpEff/" + REF_NAME + ".all.fmissing{fmiss}.{chr}.snpEff_variant_impact_plot.pdf", + fmiss=config["f_missing"], + chr=CHR,)) all_outputs.append("results/historical/snpEff/" + REF_NAME + "/multiqc/multiqc_report.html") all_outputs.append("results/modern/snpEff/" + REF_NAME + "/multiqc/multiqc_report.html") elif os.path.exists(config["historical_samples"]): - all_outputs.append(expand("results/historical/snpEff/" + REF_NAME + ".historical.fmissing{fmiss}.snpEff_variant_impact_plot.pdf", - fmiss=config["f_missing"],)) + all_outputs.append(expand("results/historical/snpEff/" + REF_NAME + ".historical.fmissing{fmiss}.{chr}.snpEff_variant_impact_plot.pdf", + fmiss=config["f_missing"], + chr=CHR,)) all_outputs.append("results/historical/snpEff/" + REF_NAME + "/multiqc/multiqc_report.html") elif os.path.exists(config["modern_samples"]): - all_outputs.append(expand("results/modern/snpEff/" + REF_NAME + ".modern.fmissing{fmiss}.snpEff_variant_impact_plot.pdf", - fmiss=config["f_missing"],)) + all_outputs.append(expand("results/modern/snpEff/" + REF_NAME + ".modern.fmissing{fmiss}.{chr}.snpEff_variant_impact_plot.pdf", + fmiss=config["f_missing"], + chr=CHR,)) all_outputs.append("results/modern/snpEff/" + REF_NAME + 
"/multiqc/multiqc_report.html") # Functions used by rules of this part of the pipeline def historical_snpEff_multiqc_inputs(wildcards): """Input for historical_snpEff_multiqc""" - rescaled_not_subsampled_not_CpG = expand("results/historical/snpEff/" + REF_NAME + "/{sample}.merged.rmdup.merged.realn.rescaled.Q30.sorted.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}_stats.csv", + rescaled_not_subsampled_not_CpG = expand("results/historical/snpEff/" + REF_NAME + "/{sample}.merged.rmdup.merged.realn.rescaled.Q30.sorted.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}_stats.csv", sample=HIST_RESCALED_NOT_SUBSAMPLED_NOT_CpG_SAMPLES, - fmiss=config["f_missing"],) - not_rescaled_not_subsampled_not_CpG = expand("results/historical/snpEff/" + REF_NAME + "/{sample}.merged.rmdup.merged.realn.Q30.sorted.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}_stats.csv", + fmiss=config["f_missing"], + chr=CHR,) + not_rescaled_not_subsampled_not_CpG = expand("results/historical/snpEff/" + REF_NAME + "/{sample}.merged.rmdup.merged.realn.Q30.sorted.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}_stats.csv", sample=HIST_NOT_RESCALED_NOT_SUBSAMPLED_NOT_CpG_SAMPLES, - fmiss=config["f_missing"],) - rescaled_subsampled_not_CpG = expand("results/historical/snpEff/" + REF_NAME + "/{sample}.merged.rmdup.merged.realn.rescaled.mapped_q30.subs_dp{DP}.Q30.sorted.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}_stats.csv", + fmiss=config["f_missing"], + chr=CHR,) + rescaled_subsampled_not_CpG = expand("results/historical/snpEff/" + REF_NAME + "/{sample}.merged.rmdup.merged.realn.rescaled.mapped_q30.subs_dp{DP}.Q30.sorted.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}_stats.csv", sample=HIST_RESCALED_SUBSAMPLED_NOT_CpG_SAMPLES, DP=config["subsampling_depth"], - fmiss=config["f_missing"],) - not_rescaled_subsampled_not_CpG = expand("results/historical/snpEff/" + REF_NAME + 
"/{sample}.merged.rmdup.merged.realn.mapped_q30.subs_dp{DP}.Q30.sorted.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}_stats.csv", + fmiss=config["f_missing"], + chr=CHR,) + not_rescaled_subsampled_not_CpG = expand("results/historical/snpEff/" + REF_NAME + "/{sample}.merged.rmdup.merged.realn.mapped_q30.subs_dp{DP}.Q30.sorted.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}_stats.csv", sample=HIST_NOT_RESCALED_SUBSAMPLED_NOT_CpG_SAMPLES, DP=config["subsampling_depth"], - fmiss=config["f_missing"],) + fmiss=config["f_missing"], + chr=CHR,) outlist = (rescaled_not_subsampled_not_CpG + not_rescaled_not_subsampled_not_CpG + rescaled_subsampled_not_CpG + not_rescaled_subsampled_not_CpG) if config["CpG_from_vcf"] == True: - rescaled_not_subsampled_CpG = expand("results/historical/snpEff/" + REF_NAME + "/{sample}.merged.rmdup.merged.realn.rescaled.Q30.sorted.noCpG_vcf.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}_stats.csv", + rescaled_not_subsampled_CpG = expand("results/historical/snpEff/" + REF_NAME + "/{sample}.merged.rmdup.merged.realn.rescaled.Q30.sorted.noCpG_vcf.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}_stats.csv", sample=HIST_RESCALED_NOT_SUBSAMPLED_CpG_SAMPLES, - fmiss=config["f_missing"],) - not_rescaled_not_subsampled_CpG = expand("results/historical/snpEff/" + REF_NAME + "/{sample}.merged.rmdup.merged.realn.Q30.sorted.noCpG_vcf.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}_stats.csv", + fmiss=config["f_missing"], + chr=CHR,) + not_rescaled_not_subsampled_CpG = expand("results/historical/snpEff/" + REF_NAME + "/{sample}.merged.rmdup.merged.realn.Q30.sorted.noCpG_vcf.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}_stats.csv", sample=HIST_NOT_RESCALED_NOT_SUBSAMPLED_CpG_SAMPLES, - fmiss=config["f_missing"],) - rescaled_subsampled_CpG = expand("results/historical/snpEff/" + REF_NAME + 
"/{sample}.merged.rmdup.merged.realn.rescaled.mapped_q30.subs_dp{DP}.Q30.sorted.noCpG_vcf.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}_stats.csv", + fmiss=config["f_missing"], + chr=CHR,) + rescaled_subsampled_CpG = expand("results/historical/snpEff/" + REF_NAME + "/{sample}.merged.rmdup.merged.realn.rescaled.mapped_q30.subs_dp{DP}.Q30.sorted.noCpG_vcf.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}_stats.csv", sample=HIST_RESCALED_SUBSAMPLED_CpG_SAMPLES, DP=config["subsampling_depth"], - fmiss=config["f_missing"],) - not_rescaled_subsampled_CpG = expand("results/historical/snpEff/" + REF_NAME + "/{sample}.merged.rmdup.merged.realn.mapped_q30.subs_dp{DP}.Q30.sorted.noCpG_vcf.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}_stats.csv", + fmiss=config["f_missing"], + chr=CHR,) + not_rescaled_subsampled_CpG = expand("results/historical/snpEff/" + REF_NAME + "/{sample}.merged.rmdup.merged.realn.mapped_q30.subs_dp{DP}.Q30.sorted.noCpG_vcf.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}_stats.csv", sample=HIST_NOT_RESCALED_SUBSAMPLED_CpG_SAMPLES, DP=config["subsampling_depth"], - fmiss=config["f_missing"],) + fmiss=config["f_missing"], + chr=CHR,) outlist += (rescaled_not_subsampled_CpG + not_rescaled_not_subsampled_CpG + rescaled_subsampled_CpG + not_rescaled_subsampled_CpG) elif config["CpG_from_reference"] == True: - rescaled_not_subsampled_CpG = expand("results/historical/snpEff/" + REF_NAME + "/{sample}.merged.rmdup.merged.realn.rescaled.Q30.sorted.noCpG_ref.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}_stats.csv", + rescaled_not_subsampled_CpG = expand("results/historical/snpEff/" + REF_NAME + "/{sample}.merged.rmdup.merged.realn.rescaled.Q30.sorted.noCpG_ref.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}_stats.csv", sample=HIST_RESCALED_NOT_SUBSAMPLED_CpG_SAMPLES, - fmiss=config["f_missing"],) - not_rescaled_not_subsampled_CpG = expand("results/historical/snpEff/" + REF_NAME + 
"/{sample}.merged.rmdup.merged.realn.Q30.sorted.noCpG_ref.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}_stats.csv", + fmiss=config["f_missing"], + chr=CHR,) + not_rescaled_not_subsampled_CpG = expand("results/historical/snpEff/" + REF_NAME + "/{sample}.merged.rmdup.merged.realn.Q30.sorted.noCpG_ref.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}_stats.csv", sample=HIST_NOT_RESCALED_NOT_SUBSAMPLED_CpG_SAMPLES, - fmiss=config["f_missing"],) - rescaled_subsampled_CpG = expand("results/historical/snpEff/" + REF_NAME + "/{sample}.merged.rmdup.merged.realn.rescaled.mapped_q30.subs_dp{DP}.Q30.sorted.noCpG_ref.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}_stats.csv", + fmiss=config["f_missing"], + chr=CHR,) + rescaled_subsampled_CpG = expand("results/historical/snpEff/" + REF_NAME + "/{sample}.merged.rmdup.merged.realn.rescaled.mapped_q30.subs_dp{DP}.Q30.sorted.noCpG_ref.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}_stats.csv", sample=HIST_RESCALED_SUBSAMPLED_CpG_SAMPLES, DP=config["subsampling_depth"], - fmiss=config["f_missing"],) - not_rescaled_subsampled_CpG = expand("results/historical/snpEff/" + REF_NAME + "/{sample}.merged.rmdup.merged.realn.mapped_q30.subs_dp{DP}.Q30.sorted.noCpG_ref.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}_stats.csv", + fmiss=config["f_missing"], + chr=CHR,) + not_rescaled_subsampled_CpG = expand("results/historical/snpEff/" + REF_NAME + "/{sample}.merged.rmdup.merged.realn.mapped_q30.subs_dp{DP}.Q30.sorted.noCpG_ref.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}_stats.csv", sample=HIST_NOT_RESCALED_SUBSAMPLED_CpG_SAMPLES, DP=config["subsampling_depth"], - fmiss=config["f_missing"],) + fmiss=config["f_missing"], + chr=CHR,) outlist += (rescaled_not_subsampled_CpG + not_rescaled_not_subsampled_CpG + rescaled_subsampled_CpG + not_rescaled_subsampled_CpG) elif config["CpG_from_vcf_and_reference"] == True: - rescaled_not_subsampled_CpG = 
expand("results/historical/snpEff/" + REF_NAME + "/{sample}.merged.rmdup.merged.realn.rescaled.Q30.sorted.noCpG_vcfref.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}_stats.csv", + rescaled_not_subsampled_CpG = expand("results/historical/snpEff/" + REF_NAME + "/{sample}.merged.rmdup.merged.realn.rescaled.Q30.sorted.noCpG_vcfref.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}_stats.csv", sample=HIST_RESCALED_NOT_SUBSAMPLED_CpG_SAMPLES, - fmiss=config["f_missing"],) - not_rescaled_not_subsampled_CpG = expand("results/historical/snpEff/" + REF_NAME + "/{sample}.merged.rmdup.merged.realn.Q30.sorted.noCpG_vcfref.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}_stats.csv", + fmiss=config["f_missing"], + chr=CHR,) + not_rescaled_not_subsampled_CpG = expand("results/historical/snpEff/" + REF_NAME + "/{sample}.merged.rmdup.merged.realn.Q30.sorted.noCpG_vcfref.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}_stats.csv", sample=HIST_NOT_RESCALED_NOT_SUBSAMPLED_CpG_SAMPLES, - fmiss=config["f_missing"],) - rescaled_subsampled_CpG = expand("results/historical/snpEff/" + REF_NAME + "/{sample}.merged.rmdup.merged.realn.rescaled.mapped_q30.subs_dp{DP}.Q30.sorted.noCpG_vcfref.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}_stats.csv", + fmiss=config["f_missing"], + chr=CHR,) + rescaled_subsampled_CpG = expand("results/historical/snpEff/" + REF_NAME + "/{sample}.merged.rmdup.merged.realn.rescaled.mapped_q30.subs_dp{DP}.Q30.sorted.noCpG_vcfref.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}_stats.csv", sample=HIST_RESCALED_SUBSAMPLED_CpG_SAMPLES, DP=config["subsampling_depth"], - fmiss=config["f_missing"],) - not_rescaled_subsampled_CpG = expand("results/historical/snpEff/" + REF_NAME + "/{sample}.merged.rmdup.merged.realn.mapped_q30.subs_dp{DP}.Q30.sorted.noCpG_vcfref.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}_stats.csv", + fmiss=config["f_missing"], + chr=CHR,) + 
not_rescaled_subsampled_CpG = expand("results/historical/snpEff/" + REF_NAME + "/{sample}.merged.rmdup.merged.realn.mapped_q30.subs_dp{DP}.Q30.sorted.noCpG_vcfref.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}_stats.csv", sample=HIST_NOT_RESCALED_SUBSAMPLED_CpG_SAMPLES, DP=config["subsampling_depth"], - fmiss=config["f_missing"],) + fmiss=config["f_missing"], + chr=CHR,) outlist += (rescaled_not_subsampled_CpG + not_rescaled_not_subsampled_CpG + rescaled_subsampled_CpG + not_rescaled_subsampled_CpG) return outlist def modern_snpEff_multiqc_inputs(wildcards): """Input for modern_snpEff_multiqc""" - not_subsampled_not_CpG = expand("results/modern/snpEff/" + REF_NAME + "/{sample}.merged.rmdup.merged.realn.Q30.sorted.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}_stats.csv", + not_subsampled_not_CpG = expand("results/modern/snpEff/" + REF_NAME + "/{sample}.merged.rmdup.merged.realn.Q30.sorted.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}_stats.csv", sample=MODERN_NOT_SUBSAMPLED_NOT_CpG_SAMPLES, - fmiss=config["f_missing"],) - subsampled_not_CpG = expand("results/modern/snpEff/" + REF_NAME + "/{sample}.merged.rmdup.merged.realn.mapped_q30.subs_dp{DP}.Q30.sorted.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}_stats.csv", + fmiss=config["f_missing"], + chr=CHR,) + subsampled_not_CpG = expand("results/modern/snpEff/" + REF_NAME + "/{sample}.merged.rmdup.merged.realn.mapped_q30.subs_dp{DP}.Q30.sorted.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}_stats.csv", sample=MODERN_SUBSAMPLED_NOT_CpG_SAMPLES, DP=config["subsampling_depth"], - fmiss=config["f_missing"],) + fmiss=config["f_missing"], + chr=CHR,) outlist = (not_subsampled_not_CpG + subsampled_not_CpG) if config["CpG_from_vcf"] == True: - not_subsampled_CpG = expand("results/modern/snpEff/" + REF_NAME + "/{sample}.merged.rmdup.merged.realn.Q30.sorted.noCpG_vcf.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}_stats.csv", + 
not_subsampled_CpG = expand("results/modern/snpEff/" + REF_NAME + "/{sample}.merged.rmdup.merged.realn.Q30.sorted.noCpG_vcf.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}_stats.csv", sample=MODERN_NOT_SUBSAMPLED_CpG_SAMPLES, - fmiss=config["f_missing"],) - subsampled_CpG = expand("results/modern/snpEff/" + REF_NAME + "/{sample}.merged.rmdup.merged.realn.mapped_q30.subs_dp{DP}.Q30.sorted.noCpG_vcf.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}_stats.csv", + fmiss=config["f_missing"], + chr=CHR,) + subsampled_CpG = expand("results/modern/snpEff/" + REF_NAME + "/{sample}.merged.rmdup.merged.realn.mapped_q30.subs_dp{DP}.Q30.sorted.noCpG_vcf.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}_stats.csv", sample=MODERN_SUBSAMPLED_CpG_SAMPLES, DP=config["subsampling_depth"], - fmiss=config["f_missing"],) + fmiss=config["f_missing"], + chr=CHR,) outlist += (not_subsampled_CpG + subsampled_CpG) elif config["CpG_from_reference"] == True: - not_subsampled_CpG = expand("results/modern/snpEff/" + REF_NAME + "/{sample}.merged.rmdup.merged.realn.Q30.sorted.noCpG_ref.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}_stats.csv", + not_subsampled_CpG = expand("results/modern/snpEff/" + REF_NAME + "/{sample}.merged.rmdup.merged.realn.Q30.sorted.noCpG_ref.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}_stats.csv", sample=MODERN_NOT_SUBSAMPLED_CpG_SAMPLES, - fmiss=config["f_missing"],) - subsampled_CpG = expand("results/modern/snpEff/" + REF_NAME + "/{sample}.merged.rmdup.merged.realn.mapped_q30.subs_dp{DP}.Q30.sorted.noCpG_ref.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}_stats.csv", + fmiss=config["f_missing"], + chr=CHR,) + subsampled_CpG = expand("results/modern/snpEff/" + REF_NAME + "/{sample}.merged.rmdup.merged.realn.mapped_q30.subs_dp{DP}.Q30.sorted.noCpG_ref.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}_stats.csv", sample=MODERN_SUBSAMPLED_CpG_SAMPLES, 
DP=config["subsampling_depth"], - fmiss=config["f_missing"],) + fmiss=config["f_missing"], + chr=CHR,) outlist += (not_subsampled_CpG + subsampled_CpG) elif config["CpG_from_vcf_and_reference"] == True: - not_subsampled_CpG = expand("results/modern/snpEff/" + REF_NAME + "/{sample}.merged.rmdup.merged.realn.Q30.sorted.noCpG_vcfref.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}_stats.csv", + not_subsampled_CpG = expand("results/modern/snpEff/" + REF_NAME + "/{sample}.merged.rmdup.merged.realn.Q30.sorted.noCpG_vcfref.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}_stats.csv", sample=MODERN_NOT_SUBSAMPLED_CpG_SAMPLES, - fmiss=config["f_missing"],) - subsampled_CpG = expand("results/modern/snpEff/" + REF_NAME + "/{sample}.merged.rmdup.merged.realn.mapped_q30.subs_dp{DP}.Q30.sorted.noCpG_vcfref.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}_stats.csv", + fmiss=config["f_missing"], + chr=CHR,) + subsampled_CpG = expand("results/modern/snpEff/" + REF_NAME + "/{sample}.merged.rmdup.merged.realn.mapped_q30.subs_dp{DP}.Q30.sorted.noCpG_vcfref.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}_stats.csv", sample=MODERN_SUBSAMPLED_CpG_SAMPLES, DP=config["subsampling_depth"], - fmiss=config["f_missing"],) + fmiss=config["f_missing"], + chr=CHR,) outlist += (not_subsampled_CpG + subsampled_CpG) return outlist @@ -244,10 +271,10 @@ rule filter_biallelic_missing_vcf_snpEff: bed=rules.filtered_vcf2bed.output.bed, genomefile=rules.genome_file.output.genomefile, output: - filtered=temp("results/{dataset}/snpEff/" + REF_NAME + "/{sample}.merged.rmdup.merged.{processed}.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.vcf.gz"), + filtered=temp("results/{dataset}/snpEff/" + REF_NAME + "/{sample}.merged.rmdup.merged.{processed}.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.vcf.gz"), threads: 6 log: - "results/logs/12_snpEff/{dataset}/" + REF_NAME + 
"/{sample}.{processed}_fmissing{fmiss}_filter_biallelic_missing_vcf.log", + "results/logs/12_snpEff/{dataset}/" + REF_NAME + "/{sample}.{processed}_fmissing{fmiss}.{chr}_filter_biallelic_missing_vcf.log", singularity: "docker://nbisweden/generode-bedtools-2.29.2" shell: @@ -263,16 +290,16 @@ rule annotate_vcf: db=rules.build_snpEff_db.output.db, config=rules.update_snpEff_config.output.config, output: - ann="results/{dataset}/snpEff/" + REF_NAME + "/{sample}.merged.rmdup.merged.{processed}.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.ann.vcf", - csv="results/{dataset}/snpEff/" + REF_NAME + "/{sample}.merged.rmdup.merged.{processed}.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}_stats.csv", - html="results/{dataset}/snpEff/" + REF_NAME + "/{sample}.merged.rmdup.merged.{processed}.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}_stats.html", + ann="results/{dataset}/snpEff/" + REF_NAME + "/{sample}.merged.rmdup.merged.{processed}.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.ann.vcf", + csv="results/{dataset}/snpEff/" + REF_NAME + "/{sample}.merged.rmdup.merged.{processed}.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}_stats.csv", + html="results/{dataset}/snpEff/" + REF_NAME + "/{sample}.merged.rmdup.merged.{processed}.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}_stats.html", threads: 1 params: ref_name=REF_NAME, abs_config=lambda wildcards, input: os.path.abspath(input.config), abs_data_dir=os.path.abspath(GTF_DIR + "/snpEff/data/"), log: - "results/logs/12_snpEff/{dataset}/" + REF_NAME + "/{sample}.{processed}_fmissing{fmiss}_annotate_vcf.log", + "results/logs/12_snpEff/{dataset}/" + REF_NAME + "/{sample}.{processed}_fmissing{fmiss}.{chr}_annotate_vcf.log", singularity: "docker://quay.io/biocontainers/snpeff:4.3.1t--3" shell: @@ -330,11 +357,11 @@ rule snpEff_variant_impact_table: input: all_snpEff_outputs, output: - table=report("results/{dataset}/snpEff/" + REF_NAME + 
".{dataset}.fmissing{fmiss}.snpEff_variant_impact_table.txt", + table=report("results/{dataset}/snpEff/" + REF_NAME + ".{dataset}.fmissing{fmiss}.{chr}.snpEff_variant_impact_table.txt", caption="../report/snpEff_variant_impact_table.rst", category="snpEff",), log: - "results/logs/12_snpEff/{dataset}/" + REF_NAME + ".{dataset}.fmissing{fmiss}.snpEff_variant_impact_table.log", + "results/logs/12_snpEff/{dataset}/" + REF_NAME + ".{dataset}.fmissing{fmiss}.{chr}.snpEff_variant_impact_table.log", script: "../scripts/snpEff_variant_impact_table.py" @@ -342,12 +369,12 @@ rule snpEff_variant_impact_table: rule snpEff_variant_impact_plot: """Plot numbers of SNPs with high, moderate, low, and modifier impact, for modern and historical samples.""" input: - table= "results/{dataset}/snpEff/" + REF_NAME + ".{dataset}.fmissing{fmiss}.snpEff_variant_impact_table.txt", + table= "results/{dataset}/snpEff/" + REF_NAME + ".{dataset}.fmissing{fmiss}.{chr}.snpEff_variant_impact_table.txt", output: - plot=report("results/{dataset}/snpEff/" + REF_NAME + ".{dataset}.fmissing{fmiss}.snpEff_variant_impact_plot.pdf", + plot=report("results/{dataset}/snpEff/" + REF_NAME + ".{dataset}.fmissing{fmiss}.{chr}.snpEff_variant_impact_plot.pdf", caption="../report/snpEff_variant_impact_plot.rst", category="snpEff",), log: - "results/logs/12_snpEff/{dataset}/" + REF_NAME + ".{dataset}.fmissing{fmiss}.snpEff_variant_impact_plot.log", + "results/logs/12_snpEff/{dataset}/" + REF_NAME + ".{dataset}.fmissing{fmiss}.{chr}.snpEff_variant_impact_plot.log", script: "../scripts/snpEff_variant_impact_plot.py" diff --git a/workflow/rules/13_GERP.smk b/workflow/rules/13_GERP.smk index cdf9c17..3400f81 100644 --- a/workflow/rules/13_GERP.smk +++ b/workflow/rules/13_GERP.smk @@ -4,25 +4,29 @@ # Code collecting output files from this part of the pipeline all_outputs.append("results/gerp/" + REF_NAME + ".ancestral.rates.gerp.hist.pdf") + if os.path.exists(config["historical_samples"]) and 
os.path.exists(config["modern_samples"]): all_outputs.append(expand("results/gerp/{dataset}/" + REF_NAME + "/vcf/stats/multiqc/multiqc_report.html", dataset=["historical", "modern"])) - all_outputs.append(expand("results/gerp/all/" + REF_NAME + ".all.fmissing{fmiss}.relative_mutational_load.gerp_{minGERP}_{maxGERP}_plot.pdf", + all_outputs.append(expand("results/gerp/all/" + REF_NAME + ".all.fmissing{fmiss}.{chr}.relative_mutational_load.gerp_{minGERP}_{maxGERP}_plot.pdf", fmiss=config["f_missing"], + chr=CHR, minGERP=config["min_gerp"], maxGERP=config["max_gerp"],)) elif os.path.exists(config["historical_samples"]): all_outputs.append("results/gerp/historical/" + REF_NAME + "/vcf/stats/multiqc/multiqc_report.html") - all_outputs.append(expand("results/gerp/historical/" + REF_NAME + ".historical.fmissing{fmiss}.relative_mutational_load.gerp_{minGERP}_{maxGERP}_plot.pdf", + all_outputs.append(expand("results/gerp/historical/" + REF_NAME + ".historical.fmissing{fmiss}.{chr}.relative_mutational_load.gerp_{minGERP}_{maxGERP}_plot.pdf", fmiss=config["f_missing"], + chr=CHR, minGERP=config["min_gerp"], maxGERP=config["max_gerp"],)) elif os.path.exists(config["modern_samples"]): all_outputs.append("results/gerp/modern/" + REF_NAME + "/vcf/stats/multiqc/multiqc_report.html") - all_outputs.append(expand("results/gerp/modern/" + REF_NAME + ".modern.fmissing{fmiss}.relative_mutational_load.gerp_{minGERP}_{maxGERP}_plot.pdf", + all_outputs.append(expand("results/gerp/modern/" + REF_NAME + ".modern.fmissing{fmiss}.{chr}.relative_mutational_load.gerp_{minGERP}_{maxGERP}_plot.pdf", fmiss=config["f_missing"], + chr=CHR, minGERP=config["min_gerp"], maxGERP=config["max_gerp"],)) @@ -30,107 +34,131 @@ elif os.path.exists(config["modern_samples"]): # Functions used by rules of this part of the pipeline def historical_biallelic_missing_filtered_vcf_gerp_multiqc_inputs(wildcards): """Input for historical_biallelic_missing_filtered_vcf_gerp_multiqc_inputs""" - 
rescaled_not_subsampled_not_CpG = expand("results/gerp/historical/" + REF_NAME + "/vcf/stats/{sample}.merged.rmdup.merged.realn.rescaled.Q30.sorted.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.vcf.stats.txt", + rescaled_not_subsampled_not_CpG = expand("results/gerp/historical/" + REF_NAME + "/vcf/stats/{sample}.merged.rmdup.merged.realn.rescaled.Q30.sorted.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.vcf.stats.txt", sample=HIST_RESCALED_NOT_SUBSAMPLED_NOT_CpG_SAMPLES, - fmiss=config["f_missing"],) - not_rescaled_not_subsampled_not_CpG = expand("results/gerp/historical/" + REF_NAME + "/vcf/stats/{sample}.merged.rmdup.merged.realn.Q30.sorted.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.vcf.stats.txt", + fmiss=config["f_missing"], + chr=CHR,) + not_rescaled_not_subsampled_not_CpG = expand("results/gerp/historical/" + REF_NAME + "/vcf/stats/{sample}.merged.rmdup.merged.realn.Q30.sorted.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.vcf.stats.txt", sample=HIST_NOT_RESCALED_NOT_SUBSAMPLED_NOT_CpG_SAMPLES, - fmiss=config["f_missing"],) - rescaled_subsampled_not_CpG = expand("results/gerp/historical/" + REF_NAME + "/vcf/stats/{sample}.merged.rmdup.merged.realn.rescaled.mapped_q30.subs_dp{DP}.Q30.sorted.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.vcf.stats.txt", + fmiss=config["f_missing"], + chr=CHR,) + rescaled_subsampled_not_CpG = expand("results/gerp/historical/" + REF_NAME + "/vcf/stats/{sample}.merged.rmdup.merged.realn.rescaled.mapped_q30.subs_dp{DP}.Q30.sorted.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.vcf.stats.txt", sample=HIST_RESCALED_SUBSAMPLED_NOT_CpG_SAMPLES, DP=config["subsampling_depth"], - fmiss=config["f_missing"],) - not_rescaled_subsampled_not_CpG = expand("results/gerp/historical/" + REF_NAME + "/vcf/stats/{sample}.merged.rmdup.merged.realn.mapped_q30.subs_dp{DP}.Q30.sorted.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.vcf.stats.txt", + 
fmiss=config["f_missing"], + chr=CHR,) + not_rescaled_subsampled_not_CpG = expand("results/gerp/historical/" + REF_NAME + "/vcf/stats/{sample}.merged.rmdup.merged.realn.mapped_q30.subs_dp{DP}.Q30.sorted.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.vcf.stats.txt", sample=HIST_NOT_RESCALED_SUBSAMPLED_NOT_CpG_SAMPLES, DP=config["subsampling_depth"], - fmiss=config["f_missing"],) + fmiss=config["f_missing"], + chr=CHR,) outlist = (rescaled_not_subsampled_not_CpG + not_rescaled_not_subsampled_not_CpG + rescaled_subsampled_not_CpG + not_rescaled_subsampled_not_CpG) if config["CpG_from_vcf"] == True: - rescaled_not_subsampled_CpG = expand("results/gerp/historical/" + REF_NAME + "/vcf/stats/{sample}.merged.rmdup.merged.realn.rescaled.Q30.sorted.noCpG_vcf.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.vcf.stats.txt", + rescaled_not_subsampled_CpG = expand("results/gerp/historical/" + REF_NAME + "/vcf/stats/{sample}.merged.rmdup.merged.realn.rescaled.Q30.sorted.noCpG_vcf.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.vcf.stats.txt", sample=HIST_RESCALED_NOT_SUBSAMPLED_CpG_SAMPLES, - fmiss=config["f_missing"],) - not_rescaled_not_subsampled_CpG = expand("results/gerp/historical/" + REF_NAME + "/vcf/stats/{sample}.merged.rmdup.merged.realn.Q30.sorted.noCpG_vcf.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.vcf.stats.txt", + fmiss=config["f_missing"], + chr=CHR,) + not_rescaled_not_subsampled_CpG = expand("results/gerp/historical/" + REF_NAME + "/vcf/stats/{sample}.merged.rmdup.merged.realn.Q30.sorted.noCpG_vcf.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.vcf.stats.txt", sample=HIST_NOT_RESCALED_NOT_SUBSAMPLED_CpG_SAMPLES, - fmiss=config["f_missing"],) - rescaled_subsampled_CpG = expand("results/gerp/historical/" + REF_NAME + "/vcf/stats/{sample}.merged.rmdup.merged.realn.rescaled.mapped_q30.subs_dp{DP}.Q30.sorted.noCpG_vcf.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.vcf.stats.txt", 
+ fmiss=config["f_missing"], + chr=CHR,) + rescaled_subsampled_CpG = expand("results/gerp/historical/" + REF_NAME + "/vcf/stats/{sample}.merged.rmdup.merged.realn.rescaled.mapped_q30.subs_dp{DP}.Q30.sorted.noCpG_vcf.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.vcf.stats.txt", sample=HIST_RESCALED_SUBSAMPLED_CpG_SAMPLES, DP=config["subsampling_depth"], - fmiss=config["f_missing"],) - not_rescaled_subsampled_CpG = expand("results/gerp/historical/" + REF_NAME + "/vcf/stats/{sample}.merged.rmdup.merged.realn.mapped_q30.subs_dp{DP}.Q30.sorted.noCpG_vcf.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.vcf.stats.txt", + fmiss=config["f_missing"], + chr=CHR,) + not_rescaled_subsampled_CpG = expand("results/gerp/historical/" + REF_NAME + "/vcf/stats/{sample}.merged.rmdup.merged.realn.mapped_q30.subs_dp{DP}.Q30.sorted.noCpG_vcf.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.vcf.stats.txt", sample=HIST_NOT_RESCALED_SUBSAMPLED_CpG_SAMPLES, DP=config["subsampling_depth"], - fmiss=config["f_missing"],) + fmiss=config["f_missing"], + chr=CHR,) outlist += (rescaled_not_subsampled_CpG + not_rescaled_not_subsampled_CpG + rescaled_subsampled_CpG + not_rescaled_subsampled_CpG) elif config["CpG_from_reference"] == True: - rescaled_not_subsampled_CpG = expand("results/gerp/historical/" + REF_NAME + "/vcf/stats/{sample}.merged.rmdup.merged.realn.rescaled.Q30.sorted.noCpG_ref.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.vcf.stats.txt", + rescaled_not_subsampled_CpG = expand("results/gerp/historical/" + REF_NAME + "/vcf/stats/{sample}.merged.rmdup.merged.realn.rescaled.Q30.sorted.noCpG_ref.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.vcf.stats.txt", sample=HIST_RESCALED_NOT_SUBSAMPLED_CpG_SAMPLES, - fmiss=config["f_missing"],) - not_rescaled_not_subsampled_CpG = expand("results/gerp/historical/" + REF_NAME + 
"/vcf/stats/{sample}.merged.rmdup.merged.realn.Q30.sorted.noCpG_ref.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.vcf.stats.txt", + fmiss=config["f_missing"], + chr=CHR,) + not_rescaled_not_subsampled_CpG = expand("results/gerp/historical/" + REF_NAME + "/vcf/stats/{sample}.merged.rmdup.merged.realn.Q30.sorted.noCpG_ref.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.vcf.stats.txt", sample=HIST_NOT_RESCALED_NOT_SUBSAMPLED_CpG_SAMPLES, - fmiss=config["f_missing"],) - rescaled_subsampled_CpG = expand("results/gerp/historical/" + REF_NAME + "/vcf/stats/{sample}.merged.rmdup.merged.realn.rescaled.mapped_q30.subs_dp{DP}.Q30.sorted.noCpG_ref.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.vcf.stats.txt", + fmiss=config["f_missing"], + chr=CHR,) + rescaled_subsampled_CpG = expand("results/gerp/historical/" + REF_NAME + "/vcf/stats/{sample}.merged.rmdup.merged.realn.rescaled.mapped_q30.subs_dp{DP}.Q30.sorted.noCpG_ref.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.vcf.stats.txt", sample=HIST_RESCALED_SUBSAMPLED_CpG_SAMPLES, DP=config["subsampling_depth"], - fmiss=config["f_missing"],) - not_rescaled_subsampled_CpG = expand("results/gerp/historical/" + REF_NAME + "/vcf/stats/{sample}.merged.rmdup.merged.realn.mapped_q30.subs_dp{DP}.Q30.sorted.noCpG_ref.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.vcf.stats.txt", + fmiss=config["f_missing"], + chr=CHR,) + not_rescaled_subsampled_CpG = expand("results/gerp/historical/" + REF_NAME + "/vcf/stats/{sample}.merged.rmdup.merged.realn.mapped_q30.subs_dp{DP}.Q30.sorted.noCpG_ref.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.vcf.stats.txt", sample=HIST_NOT_RESCALED_SUBSAMPLED_CpG_SAMPLES, DP=config["subsampling_depth"], - fmiss=config["f_missing"],) + fmiss=config["f_missing"], + chr=CHR,) outlist += (rescaled_not_subsampled_CpG + not_rescaled_not_subsampled_CpG + rescaled_subsampled_CpG + not_rescaled_subsampled_CpG) elif 
config["CpG_from_vcf_and_reference"] == True: - rescaled_not_subsampled_CpG = expand("results/gerp/historical/" + REF_NAME + "/vcf/stats/{sample}.merged.rmdup.merged.realn.rescaled.Q30.sorted.noCpG_vcfref.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.vcf.stats.txt", + rescaled_not_subsampled_CpG = expand("results/gerp/historical/" + REF_NAME + "/vcf/stats/{sample}.merged.rmdup.merged.realn.rescaled.Q30.sorted.noCpG_vcfref.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.vcf.stats.txt", sample=HIST_RESCALED_NOT_SUBSAMPLED_CpG_SAMPLES, - fmiss=config["f_missing"],) - not_rescaled_not_subsampled_CpG = expand("results/gerp/historical/" + REF_NAME + "/vcf/stats/{sample}.merged.rmdup.merged.realn.Q30.sorted.noCpG_vcfref.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.vcf.stats.txt", + fmiss=config["f_missing"], + chr=CHR,) + not_rescaled_not_subsampled_CpG = expand("results/gerp/historical/" + REF_NAME + "/vcf/stats/{sample}.merged.rmdup.merged.realn.Q30.sorted.noCpG_vcfref.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.vcf.stats.txt", sample=HIST_NOT_RESCALED_NOT_SUBSAMPLED_CpG_SAMPLES, - fmiss=config["f_missing"],) - rescaled_subsampled_CpG = expand("results/gerp/historical/" + REF_NAME + "/vcf/stats/{sample}.merged.rmdup.merged.realn.rescaled.mapped_q30.subs_dp{DP}.Q30.sorted.noCpG_vcfref.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.vcf.stats.txt", + fmiss=config["f_missing"], + chr=CHR,) + rescaled_subsampled_CpG = expand("results/gerp/historical/" + REF_NAME + "/vcf/stats/{sample}.merged.rmdup.merged.realn.rescaled.mapped_q30.subs_dp{DP}.Q30.sorted.noCpG_vcfref.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.vcf.stats.txt", sample=HIST_RESCALED_SUBSAMPLED_CpG_SAMPLES, DP=config["subsampling_depth"], - fmiss=config["f_missing"],) - not_rescaled_subsampled_CpG = expand("results/gerp/historical/" + REF_NAME + 
"/vcf/stats/{sample}.merged.rmdup.merged.realn.mapped_q30.subs_dp{DP}.Q30.sorted.noCpG_vcfref.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.vcf.stats.txt", + fmiss=config["f_missing"], + chr=CHR,) + not_rescaled_subsampled_CpG = expand("results/gerp/historical/" + REF_NAME + "/vcf/stats/{sample}.merged.rmdup.merged.realn.mapped_q30.subs_dp{DP}.Q30.sorted.noCpG_vcfref.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.vcf.stats.txt", sample=HIST_NOT_RESCALED_SUBSAMPLED_CpG_SAMPLES, DP=config["subsampling_depth"], - fmiss=config["f_missing"],) + fmiss=config["f_missing"], + chr=CHR,) outlist += (rescaled_not_subsampled_CpG + not_rescaled_not_subsampled_CpG + rescaled_subsampled_CpG + not_rescaled_subsampled_CpG) return outlist def modern_biallelic_missing_filtered_vcf_gerp_multiqc_inputs(wildcards): """Input for modern_biallelic_missing_filtered_vcf_gerp_multiqc_inputs""" - not_subsampled_not_CpG = expand("results/gerp/modern/" + REF_NAME + "/vcf/stats/{sample}.merged.rmdup.merged.realn.Q30.sorted.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.vcf.stats.txt", + not_subsampled_not_CpG = expand("results/gerp/modern/" + REF_NAME + "/vcf/stats/{sample}.merged.rmdup.merged.realn.Q30.sorted.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.vcf.stats.txt", sample=MODERN_NOT_SUBSAMPLED_NOT_CpG_SAMPLES, - fmiss=config["f_missing"],) - subsampled_not_CpG = expand("results/gerp/modern/" + REF_NAME + "/vcf/stats/{sample}.merged.rmdup.merged.realn.mapped_q30.subs_dp{DP}.Q30.sorted.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.vcf.stats.txt", + fmiss=config["f_missing"], + chr=CHR,) + subsampled_not_CpG = expand("results/gerp/modern/" + REF_NAME + "/vcf/stats/{sample}.merged.rmdup.merged.realn.mapped_q30.subs_dp{DP}.Q30.sorted.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.vcf.stats.txt", sample=MODERN_SUBSAMPLED_NOT_CpG_SAMPLES, DP=config["subsampling_depth"], - fmiss=config["f_missing"],) + 
fmiss=config["f_missing"], + chr=CHR,) outlist = (not_subsampled_not_CpG + subsampled_not_CpG) if config["CpG_from_vcf"] == True: - not_subsampled_CpG = expand("results/gerp/modern/" + REF_NAME + "/vcf/stats/{sample}.merged.rmdup.merged.realn.Q30.sorted.noCpG_vcf.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.vcf.stats.txt", + not_subsampled_CpG = expand("results/gerp/modern/" + REF_NAME + "/vcf/stats/{sample}.merged.rmdup.merged.realn.Q30.sorted.noCpG_vcf.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.vcf.stats.txt", sample=MODERN_NOT_SUBSAMPLED_CpG_SAMPLES, - fmiss=config["f_missing"],) - subsampled_CpG = expand("results/gerp/modern/" + REF_NAME + "/vcf/stats/{sample}.merged.rmdup.merged.realn.mapped_q30.subs_dp{DP}.Q30.sorted.noCpG_vcf.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.vcf.stats.txt", + fmiss=config["f_missing"], + chr=CHR,) + subsampled_CpG = expand("results/gerp/modern/" + REF_NAME + "/vcf/stats/{sample}.merged.rmdup.merged.realn.mapped_q30.subs_dp{DP}.Q30.sorted.noCpG_vcf.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.vcf.stats.txt", sample=MODERN_SUBSAMPLED_CpG_SAMPLES, DP=config["subsampling_depth"], - fmiss=config["f_missing"],) + fmiss=config["f_missing"], + chr=CHR,) outlist += (not_subsampled_CpG + subsampled_CpG) elif config["CpG_from_reference"] == True: - not_subsampled_CpG = expand("results/gerp/modern/" + REF_NAME + "/vcf/stats/{sample}.merged.rmdup.merged.realn.Q30.sorted.noCpG_ref.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.vcf.stats.txt", + not_subsampled_CpG = expand("results/gerp/modern/" + REF_NAME + "/vcf/stats/{sample}.merged.rmdup.merged.realn.Q30.sorted.noCpG_ref.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.vcf.stats.txt", sample=MODERN_NOT_SUBSAMPLED_CpG_SAMPLES, - fmiss=config["f_missing"],) - subsampled_CpG = expand("results/gerp/modern/" + REF_NAME + 
"/vcf/stats/{sample}.merged.rmdup.merged.realn.mapped_q30.subs_dp{DP}.Q30.sorted.noCpG_ref.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.vcf.stats.txt", + fmiss=config["f_missing"], + chr=CHR,) + subsampled_CpG = expand("results/gerp/modern/" + REF_NAME + "/vcf/stats/{sample}.merged.rmdup.merged.realn.mapped_q30.subs_dp{DP}.Q30.sorted.noCpG_ref.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.vcf.stats.txt", sample=MODERN_SUBSAMPLED_CpG_SAMPLES, DP=config["subsampling_depth"], - fmiss=config["f_missing"],) + fmiss=config["f_missing"], + chr=CHR,) outlist += (not_subsampled_CpG + subsampled_CpG) elif config["CpG_from_vcf_and_reference"] == True: - not_subsampled_CpG = expand("results/gerp/modern/" + REF_NAME + "/vcf/stats/{sample}.merged.rmdup.merged.realn.Q30.sorted.noCpG_vcfref.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.vcf.stats.txt", + not_subsampled_CpG = expand("results/gerp/modern/" + REF_NAME + "/vcf/stats/{sample}.merged.rmdup.merged.realn.Q30.sorted.noCpG_vcfref.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.vcf.stats.txt", sample=MODERN_NOT_SUBSAMPLED_CpG_SAMPLES, - fmiss=config["f_missing"],) - subsampled_CpG = expand("results/gerp/modern/" + REF_NAME + "/vcf/stats/{sample}.merged.rmdup.merged.realn.mapped_q30.subs_dp{DP}.Q30.sorted.noCpG_vcfref.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.vcf.stats.txt", + fmiss=config["f_missing"], + chr=CHR,) + subsampled_CpG = expand("results/gerp/modern/" + REF_NAME + "/vcf/stats/{sample}.merged.rmdup.merged.realn.mapped_q30.subs_dp{DP}.Q30.sorted.noCpG_vcfref.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.vcf.stats.txt", sample=MODERN_SUBSAMPLED_CpG_SAMPLES, DP=config["subsampling_depth"], - fmiss=config["f_missing"],) + fmiss=config["f_missing"], + chr=CHR,) outlist += (not_subsampled_CpG + subsampled_CpG) return outlist @@ -138,150 +166,174 @@ def rel_load_table_inputs(wildcards): """Collect output files for pipeline 
report""" outlist = [] if os.path.exists(config["historical_samples"]): - rescaled_not_subsampled_not_CpG = expand("results/gerp/historical/" + REF_NAME + "/{sample}.merged.rmdup.merged.realn.rescaled.Q30.sorted.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.relative_mutational_load.gerp_{minGERP}_{maxGERP}_table.txt", + rescaled_not_subsampled_not_CpG = expand("results/gerp/historical/" + REF_NAME + "/{sample}.merged.rmdup.merged.realn.rescaled.Q30.sorted.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.relative_mutational_load.gerp_{minGERP}_{maxGERP}_table.txt", sample=HIST_RESCALED_NOT_SUBSAMPLED_NOT_CpG_SAMPLES, fmiss=config["f_missing"], + chr=CHR, minGERP=config["min_gerp"], maxGERP=config["max_gerp"],) - not_rescaled_not_subsampled_not_CpG = expand("results/gerp/historical/" + REF_NAME + "/{sample}.merged.rmdup.merged.realn.Q30.sorted.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.relative_mutational_load.gerp_{minGERP}_{maxGERP}_table.txt", + not_rescaled_not_subsampled_not_CpG = expand("results/gerp/historical/" + REF_NAME + "/{sample}.merged.rmdup.merged.realn.Q30.sorted.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.relative_mutational_load.gerp_{minGERP}_{maxGERP}_table.txt", sample=HIST_NOT_RESCALED_NOT_SUBSAMPLED_NOT_CpG_SAMPLES, fmiss=config["f_missing"], + chr=CHR, minGERP=config["min_gerp"], maxGERP=config["max_gerp"],) - rescaled_subsampled_not_CpG = expand("results/gerp/historical/" + REF_NAME + "/{sample}.merged.rmdup.merged.realn.rescaled.mapped_q30.subs_dp{DP}.Q30.sorted.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.relative_mutational_load.gerp_{minGERP}_{maxGERP}_table.txt", + rescaled_subsampled_not_CpG = expand("results/gerp/historical/" + REF_NAME + "/{sample}.merged.rmdup.merged.realn.rescaled.mapped_q30.subs_dp{DP}.Q30.sorted.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.relative_mutational_load.gerp_{minGERP}_{maxGERP}_table.txt", 
sample=HIST_RESCALED_SUBSAMPLED_NOT_CpG_SAMPLES, DP=config["subsampling_depth"], fmiss=config["f_missing"], + chr=CHR, minGERP=config["min_gerp"], maxGERP=config["max_gerp"],) - not_rescaled_subsampled_not_CpG = expand("results/gerp/historical/" + REF_NAME + "/{sample}.merged.rmdup.merged.realn.mapped_q30.subs_dp{DP}.Q30.sorted.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.relative_mutational_load.gerp_{minGERP}_{maxGERP}_table.txt", + not_rescaled_subsampled_not_CpG = expand("results/gerp/historical/" + REF_NAME + "/{sample}.merged.rmdup.merged.realn.mapped_q30.subs_dp{DP}.Q30.sorted.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.relative_mutational_load.gerp_{minGERP}_{maxGERP}_table.txt", sample=HIST_NOT_RESCALED_SUBSAMPLED_NOT_CpG_SAMPLES, DP=config["subsampling_depth"], fmiss=config["f_missing"], + chr=CHR, minGERP=config["min_gerp"], maxGERP=config["max_gerp"],) outlist += (rescaled_not_subsampled_not_CpG + not_rescaled_not_subsampled_not_CpG + rescaled_subsampled_not_CpG + not_rescaled_subsampled_not_CpG) if config["CpG_from_vcf"] == True: - rescaled_not_subsampled_CpG = expand("results/gerp/historical/" + REF_NAME + "/{sample}.merged.rmdup.merged.realn.rescaled.Q30.sorted.noCpG_vcf.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.relative_mutational_load.gerp_{minGERP}_{maxGERP}_table.txt", + rescaled_not_subsampled_CpG = expand("results/gerp/historical/" + REF_NAME + "/{sample}.merged.rmdup.merged.realn.rescaled.Q30.sorted.noCpG_vcf.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.relative_mutational_load.gerp_{minGERP}_{maxGERP}_table.txt", sample=HIST_RESCALED_NOT_SUBSAMPLED_CpG_SAMPLES, fmiss=config["f_missing"], + chr=CHR, minGERP=config["min_gerp"], maxGERP=config["max_gerp"],) - not_rescaled_not_subsampled_CpG = expand("results/gerp/historical/" + REF_NAME + 
"/{sample}.merged.rmdup.merged.realn.Q30.sorted.noCpG_vcf.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.relative_mutational_load.gerp_{minGERP}_{maxGERP}_table.txt", + not_rescaled_not_subsampled_CpG = expand("results/gerp/historical/" + REF_NAME + "/{sample}.merged.rmdup.merged.realn.Q30.sorted.noCpG_vcf.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.relative_mutational_load.gerp_{minGERP}_{maxGERP}_table.txt", sample=HIST_NOT_RESCALED_NOT_SUBSAMPLED_CpG_SAMPLES, fmiss=config["f_missing"], + chr=CHR, minGERP=config["min_gerp"], maxGERP=config["max_gerp"],) - rescaled_subsampled_CpG = expand("results/gerp/historical/" + REF_NAME + "/{sample}.merged.rmdup.merged.realn.rescaled.mapped_q30.subs_dp{DP}.Q30.sorted.noCpG_vcf.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.relative_mutational_load.gerp_{minGERP}_{maxGERP}_table.txt", + rescaled_subsampled_CpG = expand("results/gerp/historical/" + REF_NAME + "/{sample}.merged.rmdup.merged.realn.rescaled.mapped_q30.subs_dp{DP}.Q30.sorted.noCpG_vcf.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.relative_mutational_load.gerp_{minGERP}_{maxGERP}_table.txt", sample=HIST_RESCALED_SUBSAMPLED_CpG_SAMPLES, DP=config["subsampling_depth"], fmiss=config["f_missing"], + chr=CHR, minGERP=config["min_gerp"], maxGERP=config["max_gerp"],) - not_rescaled_subsampled_CpG = expand("results/gerp/historical/" + REF_NAME + "/{sample}.merged.rmdup.merged.realn.mapped_q30.subs_dp{DP}.Q30.sorted.noCpG_vcf.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.relative_mutational_load.gerp_{minGERP}_{maxGERP}_table.txt", + not_rescaled_subsampled_CpG = expand("results/gerp/historical/" + REF_NAME + "/{sample}.merged.rmdup.merged.realn.mapped_q30.subs_dp{DP}.Q30.sorted.noCpG_vcf.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.relative_mutational_load.gerp_{minGERP}_{maxGERP}_table.txt", sample=HIST_NOT_RESCALED_SUBSAMPLED_CpG_SAMPLES, DP=config["subsampling_depth"], 
fmiss=config["f_missing"], + chr=CHR, minGERP=config["min_gerp"], maxGERP=config["max_gerp"],) outlist += (rescaled_not_subsampled_CpG + not_rescaled_not_subsampled_CpG + rescaled_subsampled_CpG + not_rescaled_subsampled_CpG) elif config["CpG_from_reference"] == True: - rescaled_not_subsampled_CpG = expand("results/gerp/historical/" + REF_NAME + "/{sample}.merged.rmdup.merged.realn.rescaled.Q30.sorted.noCpG_ref.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.relative_mutational_load.gerp_{minGERP}_{maxGERP}_table.txt", + rescaled_not_subsampled_CpG = expand("results/gerp/historical/" + REF_NAME + "/{sample}.merged.rmdup.merged.realn.rescaled.Q30.sorted.noCpG_ref.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.relative_mutational_load.gerp_{minGERP}_{maxGERP}_table.txt", sample=HIST_RESCALED_NOT_SUBSAMPLED_CpG_SAMPLES, fmiss=config["f_missing"], + chr=CHR, minGERP=config["min_gerp"], maxGERP=config["max_gerp"],) - not_rescaled_not_subsampled_CpG = expand("results/gerp/historical/" + REF_NAME + "/{sample}.merged.rmdup.merged.realn.Q30.sorted.noCpG_ref.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.relative_mutational_load.gerp_{minGERP}_{maxGERP}_table.txt", + not_rescaled_not_subsampled_CpG = expand("results/gerp/historical/" + REF_NAME + "/{sample}.merged.rmdup.merged.realn.Q30.sorted.noCpG_ref.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.relative_mutational_load.gerp_{minGERP}_{maxGERP}_table.txt", sample=HIST_NOT_RESCALED_NOT_SUBSAMPLED_CpG_SAMPLES, fmiss=config["f_missing"], + chr=CHR, minGERP=config["min_gerp"], maxGERP=config["max_gerp"],) - rescaled_subsampled_CpG = expand("results/gerp/historical/" + REF_NAME + "/{sample}.merged.rmdup.merged.realn.rescaled.mapped_q30.subs_dp{DP}.Q30.sorted.noCpG_ref.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.relative_mutational_load.gerp_{minGERP}_{maxGERP}_table.txt", + rescaled_subsampled_CpG = expand("results/gerp/historical/" + REF_NAME + 
"/{sample}.merged.rmdup.merged.realn.rescaled.mapped_q30.subs_dp{DP}.Q30.sorted.noCpG_ref.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.relative_mutational_load.gerp_{minGERP}_{maxGERP}_table.txt", sample=HIST_RESCALED_SUBSAMPLED_CpG_SAMPLES, DP=config["subsampling_depth"], fmiss=config["f_missing"], + chr=CHR, minGERP=config["min_gerp"], maxGERP=config["max_gerp"],) - not_rescaled_subsampled_CpG = expand("results/gerp/historical/" + REF_NAME + "/{sample}.merged.rmdup.merged.realn.mapped_q30.subs_dp{DP}.Q30.sorted.noCpG_ref.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.relative_mutational_load.gerp_{minGERP}_{maxGERP}_table.txt", + not_rescaled_subsampled_CpG = expand("results/gerp/historical/" + REF_NAME + "/{sample}.merged.rmdup.merged.realn.mapped_q30.subs_dp{DP}.Q30.sorted.noCpG_ref.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.relative_mutational_load.gerp_{minGERP}_{maxGERP}_table.txt", sample=HIST_NOT_RESCALED_SUBSAMPLED_CpG_SAMPLES, DP=config["subsampling_depth"], fmiss=config["f_missing"], + chr=CHR, minGERP=config["min_gerp"], maxGERP=config["max_gerp"],) outlist += (rescaled_not_subsampled_CpG + not_rescaled_not_subsampled_CpG + rescaled_subsampled_CpG + not_rescaled_subsampled_CpG) elif config["CpG_from_vcf_and_reference"] == True: - rescaled_not_subsampled_CpG = expand("results/gerp/historical/" + REF_NAME + "/{sample}.merged.rmdup.merged.realn.rescaled.Q30.sorted.noCpG_vcfref.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.relative_mutational_load.gerp_{minGERP}_{maxGERP}_table.txt", + rescaled_not_subsampled_CpG = expand("results/gerp/historical/" + REF_NAME + "/{sample}.merged.rmdup.merged.realn.rescaled.Q30.sorted.noCpG_vcfref.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.relative_mutational_load.gerp_{minGERP}_{maxGERP}_table.txt", sample=HIST_RESCALED_NOT_SUBSAMPLED_CpG_SAMPLES, fmiss=config["f_missing"], + chr=CHR, minGERP=config["min_gerp"], 
maxGERP=config["max_gerp"],) - not_rescaled_not_subsampled_CpG = expand("results/gerp/historical/" + REF_NAME + "/{sample}.merged.rmdup.merged.realn.Q30.sorted.noCpG_vcfref.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.relative_mutational_load.gerp_{minGERP}_{maxGERP}_table.txt", + not_rescaled_not_subsampled_CpG = expand("results/gerp/historical/" + REF_NAME + "/{sample}.merged.rmdup.merged.realn.Q30.sorted.noCpG_vcfref.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.relative_mutational_load.gerp_{minGERP}_{maxGERP}_table.txt", sample=HIST_NOT_RESCALED_NOT_SUBSAMPLED_CpG_SAMPLES, fmiss=config["f_missing"], + chr=CHR, minGERP=config["min_gerp"], maxGERP=config["max_gerp"],) - rescaled_subsampled_CpG = expand("results/gerp/historical/" + REF_NAME + "/{sample}.merged.rmdup.merged.realn.rescaled.mapped_q30.subs_dp{DP}.Q30.sorted.noCpG_vcfref.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.relative_mutational_load.gerp_{minGERP}_{maxGERP}_table.txt", + rescaled_subsampled_CpG = expand("results/gerp/historical/" + REF_NAME + "/{sample}.merged.rmdup.merged.realn.rescaled.mapped_q30.subs_dp{DP}.Q30.sorted.noCpG_vcfref.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.relative_mutational_load.gerp_{minGERP}_{maxGERP}_table.txt", sample=HIST_RESCALED_SUBSAMPLED_CpG_SAMPLES, DP=config["subsampling_depth"], fmiss=config["f_missing"], + chr=CHR, minGERP=config["min_gerp"], maxGERP=config["max_gerp"],) - not_rescaled_subsampled_CpG = expand("results/gerp/historical/" + REF_NAME + "/{sample}.merged.rmdup.merged.realn.mapped_q30.subs_dp{DP}.Q30.sorted.noCpG_vcfref.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.relative_mutational_load.gerp_{minGERP}_{maxGERP}_table.txt", + not_rescaled_subsampled_CpG = expand("results/gerp/historical/" + REF_NAME + 
"/{sample}.merged.rmdup.merged.realn.mapped_q30.subs_dp{DP}.Q30.sorted.noCpG_vcfref.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.relative_mutational_load.gerp_{minGERP}_{maxGERP}_table.txt", sample=HIST_NOT_RESCALED_SUBSAMPLED_CpG_SAMPLES, DP=config["subsampling_depth"], fmiss=config["f_missing"], + chr=CHR, minGERP=config["min_gerp"], maxGERP=config["max_gerp"],) outlist += (rescaled_not_subsampled_CpG + not_rescaled_not_subsampled_CpG + rescaled_subsampled_CpG + not_rescaled_subsampled_CpG) if os.path.exists(config["modern_samples"]): - not_subsampled_not_CpG = expand("results/gerp/modern/" + REF_NAME + "/{sample}.merged.rmdup.merged.realn.Q30.sorted.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.relative_mutational_load.gerp_{minGERP}_{maxGERP}_table.txt", + not_subsampled_not_CpG = expand("results/gerp/modern/" + REF_NAME + "/{sample}.merged.rmdup.merged.realn.Q30.sorted.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.relative_mutational_load.gerp_{minGERP}_{maxGERP}_table.txt", sample=MODERN_NOT_SUBSAMPLED_NOT_CpG_SAMPLES, fmiss=config["f_missing"], + chr=CHR, minGERP=config["min_gerp"], maxGERP=config["max_gerp"],) - subsampled_not_CpG = expand("results/gerp/modern/" + REF_NAME + "/{sample}.merged.rmdup.merged.realn.mapped_q30.subs_dp{DP}.Q30.sorted.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.relative_mutational_load.gerp_{minGERP}_{maxGERP}_table.txt", + subsampled_not_CpG = expand("results/gerp/modern/" + REF_NAME + "/{sample}.merged.rmdup.merged.realn.mapped_q30.subs_dp{DP}.Q30.sorted.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.relative_mutational_load.gerp_{minGERP}_{maxGERP}_table.txt", sample=MODERN_SUBSAMPLED_NOT_CpG_SAMPLES, DP=config["subsampling_depth"], fmiss=config["f_missing"], + chr=CHR, minGERP=config["min_gerp"], maxGERP=config["max_gerp"],) outlist += (not_subsampled_not_CpG + subsampled_not_CpG) if config["CpG_from_vcf"] == True: - not_subsampled_CpG = 
expand("results/gerp/modern/" + REF_NAME + "/{sample}.merged.rmdup.merged.realn.Q30.sorted.noCpG_vcf.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.relative_mutational_load.gerp_{minGERP}_{maxGERP}_table.txt", + not_subsampled_CpG = expand("results/gerp/modern/" + REF_NAME + "/{sample}.merged.rmdup.merged.realn.Q30.sorted.noCpG_vcf.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.relative_mutational_load.gerp_{minGERP}_{maxGERP}_table.txt", sample=MODERN_NOT_SUBSAMPLED_CpG_SAMPLES, fmiss=config["f_missing"], + chr=CHR, minGERP=config["min_gerp"], maxGERP=config["max_gerp"],) - subsampled_CpG = expand("results/gerp/modern/" + REF_NAME + "/{sample}.merged.rmdup.merged.realn.mapped_q30.subs_dp{DP}.Q30.sorted.noCpG_vcf.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.relative_mutational_load.gerp_{minGERP}_{maxGERP}_table.txt", + subsampled_CpG = expand("results/gerp/modern/" + REF_NAME + "/{sample}.merged.rmdup.merged.realn.mapped_q30.subs_dp{DP}.Q30.sorted.noCpG_vcf.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.relative_mutational_load.gerp_{minGERP}_{maxGERP}_table.txt", sample=MODERN_SUBSAMPLED_CpG_SAMPLES, DP=config["subsampling_depth"], fmiss=config["f_missing"], + chr=CHR, minGERP=config["min_gerp"], maxGERP=config["max_gerp"],) outlist += (not_subsampled_CpG + subsampled_CpG) elif config["CpG_from_reference"] == True: - not_subsampled_CpG = expand("results/gerp/modern/" + REF_NAME + "/{sample}.merged.rmdup.merged.realn.Q30.sorted.noCpG_ref.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.relative_mutational_load.gerp_{minGERP}_{maxGERP}_table.txt", + not_subsampled_CpG = expand("results/gerp/modern/" + REF_NAME + "/{sample}.merged.rmdup.merged.realn.Q30.sorted.noCpG_ref.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.relative_mutational_load.gerp_{minGERP}_{maxGERP}_table.txt", sample=MODERN_NOT_SUBSAMPLED_CpG_SAMPLES, fmiss=config["f_missing"], + chr=CHR, 
minGERP=config["min_gerp"], maxGERP=config["max_gerp"],) - subsampled_CpG = expand("results/gerp/modern/" + REF_NAME + "/{sample}.merged.rmdup.merged.realn.mapped_q30.subs_dp{DP}.Q30.sorted.noCpG_ref.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.relative_mutational_load.gerp_{minGERP}_{maxGERP}_table.txt", + subsampled_CpG = expand("results/gerp/modern/" + REF_NAME + "/{sample}.merged.rmdup.merged.realn.mapped_q30.subs_dp{DP}.Q30.sorted.noCpG_ref.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.relative_mutational_load.gerp_{minGERP}_{maxGERP}_table.txt", sample=MODERN_SUBSAMPLED_CpG_SAMPLES, DP=config["subsampling_depth"], fmiss=config["f_missing"], + chr=CHR, minGERP=config["min_gerp"], maxGERP=config["max_gerp"],) outlist += (not_subsampled_CpG + subsampled_CpG) elif config["CpG_from_vcf_and_reference"] == True: - not_subsampled_CpG = expand("results/gerp/modern/" + REF_NAME + "/{sample}.merged.rmdup.merged.realn.Q30.sorted.noCpG_vcfref.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.relative_mutational_load.gerp_{minGERP}_{maxGERP}_table.txt", + not_subsampled_CpG = expand("results/gerp/modern/" + REF_NAME + "/{sample}.merged.rmdup.merged.realn.Q30.sorted.noCpG_vcfref.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.relative_mutational_load.gerp_{minGERP}_{maxGERP}_table.txt", sample=MODERN_NOT_SUBSAMPLED_CpG_SAMPLES, fmiss=config["f_missing"], + chr=CHR, minGERP=config["min_gerp"], maxGERP=config["max_gerp"],) - subsampled_CpG = expand("results/gerp/modern/" + REF_NAME + "/{sample}.merged.rmdup.merged.realn.mapped_q30.subs_dp{DP}.Q30.sorted.noCpG_vcfref.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.relative_mutational_load.gerp_{minGERP}_{maxGERP}_table.txt", + subsampled_CpG = expand("results/gerp/modern/" + REF_NAME + 
"/{sample}.merged.rmdup.merged.realn.mapped_q30.subs_dp{DP}.Q30.sorted.noCpG_vcfref.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.relative_mutational_load.gerp_{minGERP}_{maxGERP}_table.txt", sample=MODERN_SUBSAMPLED_CpG_SAMPLES, DP=config["subsampling_depth"], fmiss=config["f_missing"], + chr=CHR, minGERP=config["min_gerp"], maxGERP=config["max_gerp"],) outlist += (not_subsampled_CpG + subsampled_CpG) @@ -290,11 +342,11 @@ def rel_load_table_inputs(wildcards): def all_GERP_outputs(wildcards): """Collect output files for report""" if os.path.exists(config["historical_samples"]) and os.path.exists(config["modern_samples"]): - return "results/gerp/all/" + REF_NAME + ".all.fmissing{fmiss}.relative_mutational_load.gerp_{minGERP}_{maxGERP}_table.txt" + return "results/gerp/all/" + REF_NAME + ".all.fmissing{fmiss}.{chr}.relative_mutational_load.gerp_{minGERP}_{maxGERP}_table.txt" elif os.path.exists(config["historical_samples"]): - return "results/gerp/historical/" + REF_NAME + ".historical.fmissing{fmiss}.relative_mutational_load.gerp_{minGERP}_{maxGERP}_table.txt" + return "results/gerp/historical/" + REF_NAME + ".historical.fmissing{fmiss}.{chr}.relative_mutational_load.gerp_{minGERP}_{maxGERP}_table.txt" elif os.path.exists(config["modern_samples"]): - return "results/gerp/modern/" + REF_NAME + ".modern.fmissing{fmiss}.relative_mutational_load.gerp_{minGERP}_{maxGERP}_table.txt" + return "results/gerp/modern/" + REF_NAME + ".modern.fmissing{fmiss}.{chr}.relative_mutational_load.gerp_{minGERP}_{maxGERP}_table.txt" # snakemake rules @@ -810,10 +862,10 @@ rule filter_biallelic_missing_vcf_gerp: bed=rules.filtered_vcf2bed.output.bed, genomefile=rules.genome_file.output.genomefile, output: - filtered=temp("results/gerp/{dataset}/" + REF_NAME + "/vcf/{sample}.merged.rmdup.merged.{processed}.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.vcf.gz"), + filtered=temp("results/gerp/{dataset}/" + REF_NAME + 
"/vcf/{sample}.merged.rmdup.merged.{processed}.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.vcf.gz"), threads: 6 log: - "results/logs/13_GERP/{dataset}/" + REF_NAME + "/vcf/{sample}.{processed}_fmissing{fmiss}_filter_biallelic_missing_vcf.log", + "results/logs/13_GERP/{dataset}/" + REF_NAME + "/vcf/{sample}.{processed}_fmissing{fmiss}.{chr}_filter_biallelic_missing_vcf.log", singularity: "docker://nbisweden/generode-bedtools-2.29.2" shell: @@ -825,11 +877,11 @@ rule filter_biallelic_missing_vcf_gerp: rule biallelic_missing_filtered_vcf_gerp_stats: """Obtain summary stats of filtered vcf file""" input: - filtered="results/gerp/{dataset}/" + REF_NAME + "/vcf/{sample}.merged.rmdup.merged.{processed}.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.vcf.gz", + filtered="results/gerp/{dataset}/" + REF_NAME + "/vcf/{sample}.merged.rmdup.merged.{processed}.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.vcf.gz", output: - stats="results/gerp/{dataset}/" + REF_NAME + "/vcf/stats/{sample}.merged.rmdup.merged.{processed}.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.vcf.stats.txt", + stats="results/gerp/{dataset}/" + REF_NAME + "/vcf/stats/{sample}.merged.rmdup.merged.{processed}.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.vcf.stats.txt", log: - "results/logs/13_GERP/{dataset}/" + REF_NAME + "/vcf/{sample}.{processed}_fmissing{fmiss}_biallelic_missing_filtered_vcf_stats.log", + "results/logs/13_GERP/{dataset}/" + REF_NAME + "/vcf/{sample}.{processed}_fmissing{fmiss}.{chr}_biallelic_missing_filtered_vcf_stats.log", singularity: "docker://quay.io/biocontainers/bcftools:1.9--h68d8f2e_9" shell: @@ -879,13 +931,13 @@ rule modern_biallelic_missing_filtered_vcf_gerp_multiqc: rule split_vcf_files: """Split the VCF files into chunks for more resource-efficient merging with GERP results""" input: - vcf="results/gerp/{dataset}/" + REF_NAME + 
"/vcf/{sample}.merged.rmdup.merged.{processed}.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.vcf.gz", + vcf="results/gerp/{dataset}/" + REF_NAME + "/vcf/{sample}.merged.rmdup.merged.{processed}.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.vcf.gz", chunk_bed=REF_DIR + "/gerp/" + REF_NAME + "/split_bed_files/{chunk}.bed", genomefile=REF_DIR + "/" + REF_NAME + ".genome", output: - vcf_chunk=temp("results/gerp/chunks/" + REF_NAME + "/{dataset}/vcf/{sample}.merged.rmdup.merged.{processed}.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chunk}.vcf.gz"), + vcf_chunk=temp("results/gerp/chunks/" + REF_NAME + "/{dataset}/vcf/{sample}.merged.rmdup.merged.{processed}.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.{chunk}.vcf.gz"), log: - "results/logs/13_GERP/chunks/" + REF_NAME + "/{dataset}/vcf/{sample}.{processed}_fmissing{fmiss}.{chunk}_split_vcf_chunks.log", + "results/logs/13_GERP/chunks/" + REF_NAME + "/{dataset}/vcf/{sample}.{processed}_fmissing{fmiss}.{chr}.{chunk}_split_vcf_chunks.log", singularity: "docker://nbisweden/generode-bedtools-2.29.2" shell: @@ -920,9 +972,9 @@ rule gerp_derived_alleles: vcf=rules.split_vcf_files.output.vcf_chunk, chunk_win_bed=rules.split_chunk_bed_files.output.chunk_win_bed, output: - gerp_alleles_dir=temp(directory("results/gerp/chunks/" + REF_NAME + "/{dataset}/{sample}.merged.rmdup.merged.{processed}.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chunk}_gerp_derived_alleles/")), + gerp_alleles_dir=temp(directory("results/gerp/chunks/" + REF_NAME + "/{dataset}/{sample}.merged.rmdup.merged.{processed}.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.{chunk}_gerp_derived_alleles/")), log: - "results/logs/13_GERP/chunks/" + REF_NAME + "/{dataset}/{sample}.merged.rmdup.merged.{processed}.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chunk}_gerp_derived_alleles.log", + "results/logs/13_GERP/chunks/" + REF_NAME + 
"/{dataset}/{sample}.merged.rmdup.merged.{processed}.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.{chunk}_gerp_derived_alleles.log", threads: 4 shell: """ @@ -942,9 +994,9 @@ rule merge_gerp_alleles_per_chunk: gerp_alleles_dir=rules.gerp_derived_alleles.output.gerp_alleles_dir, chunk_win_bed=rules.split_chunk_bed_files.output.chunk_win_bed, output: - gerp_chunks_merged=temp("results/gerp/chunks/" + REF_NAME + "/{dataset}/{sample}.merged.rmdup.merged.{processed}.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chunk}.fasta.parsed.rates.derived_alleles"), + gerp_chunks_merged=temp("results/gerp/chunks/" + REF_NAME + "/{dataset}/{sample}.merged.rmdup.merged.{processed}.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.{chunk}.fasta.parsed.rates.derived_alleles"), log: - "results/logs/13_GERP/chunks/" + REF_NAME + "/{dataset}/{sample}.merged.rmdup.merged.{processed}.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chunk}_merge_gerp_alleles_per_chunk.log", + "results/logs/13_GERP/chunks/" + REF_NAME + "/{dataset}/{sample}.merged.rmdup.merged.{processed}.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.{chunk}_merge_gerp_alleles_per_chunk.log", threads: 4 run: chunk_windows = [] @@ -967,15 +1019,16 @@ rule merge_gerp_alleles_per_chunk: rule merge_gerp_alleles_gz: """Merge results into one file per sample.""" input: - gerp_chunks_merged=expand("results/gerp/chunks/" + REF_NAME + "/{{dataset}}/{{sample}}.merged.rmdup.merged.{{processed}}.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chunk}.fasta.parsed.rates.derived_alleles", + gerp_chunks_merged=expand("results/gerp/chunks/" + REF_NAME + "/{{dataset}}/{{sample}}.merged.rmdup.merged.{{processed}}.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.{chunk}.fasta.parsed.rates.derived_alleles", chunk=CHUNKS, fmiss=config["f_missing"], + chr=CHR, min_gerp=config["min_gerp"], max_gerp=config["max_gerp"],), output: - 
gerp_out="results/gerp/{dataset}/" + REF_NAME + "/{sample}.merged.rmdup.merged.{processed}.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.ancestral.rates.derived.alleles.gz", + gerp_out="results/gerp/{dataset}/" + REF_NAME + "/{sample}.merged.rmdup.merged.{processed}.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.ancestral.rates.derived.alleles.gz", log: - "results/logs/13_GERP/{dataset}/" + REF_NAME + "/{sample}.merged.rmdup.merged.{processed}.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.merge_gerp_alleles_gz.log", + "results/logs/13_GERP/{dataset}/" + REF_NAME + "/{sample}.merged.rmdup.merged.{processed}.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.merge_gerp_alleles_gz.log", threads: 4 shell: """ @@ -986,14 +1039,14 @@ rule merge_gerp_alleles_gz: rule relative_mutational_load_per_sample: """Calculate the relative mutational load per sample.""" input: - gerp_out="results/gerp/{dataset}/" + REF_NAME + "/{sample}.merged.rmdup.merged.{processed}.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.ancestral.rates.derived.alleles.gz", + gerp_out="results/gerp/{dataset}/" + REF_NAME + "/{sample}.merged.rmdup.merged.{processed}.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.ancestral.rates.derived.alleles.gz", output: - mut_load=temp("results/gerp/{dataset}/" + REF_NAME + "/{sample}.merged.rmdup.merged.{processed}.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.relative_mutational_load.gerp_{minGERP}_{maxGERP}_table.txt"), + mut_load=temp("results/gerp/{dataset}/" + REF_NAME + "/{sample}.merged.rmdup.merged.{processed}.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.relative_mutational_load.gerp_{minGERP}_{maxGERP}_table.txt"), params: min_gerp=config["min_gerp"], max_gerp=config["max_gerp"], log: - "results/logs/13_GERP/{dataset}/" + REF_NAME + 
"/{sample}.merged.rmdup.merged.{processed}.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.relative_mutational_load_table.gerp_{minGERP}_{maxGERP}.log", + "results/logs/13_GERP/{dataset}/" + REF_NAME + "/{sample}.merged.rmdup.merged.{processed}.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.relative_mutational_load_table.gerp_{minGERP}_{maxGERP}.log", threads: 2 shell: """ @@ -1006,11 +1059,11 @@ rule relative_mutational_load_table: input: rel_load_table_inputs, output: - mut_load=report("results/gerp/{dataset}/" + REF_NAME + ".{dataset}.fmissing{fmiss}.relative_mutational_load.gerp_{minGERP}_{maxGERP}_table.txt", + mut_load=report("results/gerp/{dataset}/" + REF_NAME + ".{dataset}.fmissing{fmiss}.{chr}.relative_mutational_load.gerp_{minGERP}_{maxGERP}_table.txt", caption="../report/relative_mutational_load_table.rst", category="GERP",), log: - "results/logs/13_GERP/{dataset}/" + REF_NAME + ".{dataset}.fmissing{fmiss}.relative_mutational_load_table.gerp_{minGERP}_{maxGERP}.log", + "results/logs/13_GERP/{dataset}/" + REF_NAME + ".{dataset}.fmissing{fmiss}.{chr}.relative_mutational_load_table.gerp_{minGERP}_{maxGERP}.log", threads: 4 script: "../scripts/gerp_rel_mut_load_table.py" @@ -1021,10 +1074,10 @@ rule relative_mutational_load_plot: input: all_GERP_outputs, output: - plot=report("results/gerp/{dataset}/" + REF_NAME + ".{dataset}.fmissing{fmiss}.relative_mutational_load.gerp_{minGERP}_{maxGERP}_plot.pdf", + plot=report("results/gerp/{dataset}/" + REF_NAME + ".{dataset}.fmissing{fmiss}.{chr}.relative_mutational_load.gerp_{minGERP}_{maxGERP}_plot.pdf", caption="../report/relative_mutational_load_plot.rst", category="GERP",), log: - "results/logs/13_GERP/{dataset}/" + REF_NAME + ".{dataset}.fmissing{fmiss}.relative_mutational_load_plot.gerp_{minGERP}_{maxGERP}.log", + "results/logs/13_GERP/{dataset}/" + REF_NAME + ".{dataset}.fmissing{fmiss}.{chr}.relative_mutational_load_plot.gerp_{minGERP}_{maxGERP}.log", script: 
"../scripts/gerp_rel_mut_load_plot.py" diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index 908b859..be6c9fa 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -372,6 +372,11 @@ if os.path.exists(config["sexchromosomes"]): for line in file: sexchromosomeList.append(line.strip()) +if len(sexchromosomeList) > 0: + CHR = "autos" +elif len(sexchromosomeList) == 0: + CHR = "genome" + ### # snpEff if config["snpEff"]: From d480ab728308bb34e84a6313b796ade6758dc7b6 Mon Sep 17 00:00:00 2001 From: verku Date: Thu, 5 Oct 2023 17:11:23 +0200 Subject: [PATCH 14/49] Implement shorter way to select for genome-wide or autosomal-only BCF file --- workflow/rules/9_merge_vcfs.smk | 33 ++++++++++++--------------------- 1 file changed, 12 insertions(+), 21 deletions(-) diff --git a/workflow/rules/9_merge_vcfs.smk b/workflow/rules/9_merge_vcfs.smk index 62db515..2059bdb 100644 --- a/workflow/rules/9_merge_vcfs.smk +++ b/workflow/rules/9_merge_vcfs.smk @@ -151,28 +151,19 @@ def merge_all_index_inputs(wildcards): def missingness_filtered_vcf_multiqc_inputs(wildcards): """Input for missingness_filtered_vcf_multiqc""" - if len(sexchromosomeList) > 0: - if os.path.exists(config["historical_samples"]) and os.path.exists(config["modern_samples"]): - return expand("results/{dataset}/vcf/" + REF_NAME + "/stats/vcf_merged_missing/" + REF_NAME + ".{dataset}.merged.biallelic.fmissing{fmiss}.autos.vcf.stats.txt", + if os.path.exists(config["historical_samples"]) and os.path.exists(config["modern_samples"]): + return expand("results/{dataset}/vcf/" + REF_NAME + "/stats/vcf_merged_missing/" + REF_NAME + ".{dataset}.merged.biallelic.fmissing{fmiss}.{chr}.vcf.stats.txt", dataset=["all", "historical", "modern"], - fmiss=config["f_missing"],) - elif os.path.exists(config["historical_samples"]): - return expand("results/historical/vcf/" + REF_NAME + "/stats/vcf_merged_missing/" + REF_NAME + ".historical.merged.biallelic.fmissing{fmiss}.autos.vcf.stats.txt", - 
fmiss=config["f_missing"],) - elif os.path.exists(config["modern_samples"]): - return expand("results/modern/vcf/" + REF_NAME + "/stats/vcf_merged_missing/" + REF_NAME + ".modern.merged.biallelic.fmissing{fmiss}.autos.vcf.stats.txt", - fmiss=config["f_missing"],) - elif len(sexchromosomeList) == 0: - if os.path.exists(config["historical_samples"]) and os.path.exists(config["modern_samples"]): - return expand("results/{dataset}/vcf/" + REF_NAME + "/stats/vcf_merged_missing/" + REF_NAME + ".{dataset}.merged.biallelic.fmissing{fmiss}.genome.vcf.stats.txt", - dataset=["all", "historical", "modern"], - fmiss=config["f_missing"],) - elif os.path.exists(config["historical_samples"]): - return expand("results/historical/vcf/" + REF_NAME + "/stats/vcf_merged_missing/" + REF_NAME + ".historical.merged.biallelic.fmissing{fmiss}.genome.vcf.stats.txt", - fmiss=config["f_missing"],) - elif os.path.exists(config["modern_samples"]): - return expand("results/modern/vcf/" + REF_NAME + "/stats/vcf_merged_missing/" + REF_NAME + ".modern.merged.biallelic.fmissing{fmiss}.genome.vcf.stats.txt", - fmiss=config["f_missing"],) + fmiss=config["f_missing"], + chr=CHR,) + elif os.path.exists(config["historical_samples"]): + return expand("results/historical/vcf/" + REF_NAME + "/stats/vcf_merged_missing/" + REF_NAME + ".historical.merged.biallelic.fmissing{fmiss}.{chr}.vcf.stats.txt", + fmiss=config["f_missing"], + chr=CHR,) + elif os.path.exists(config["modern_samples"]): + return expand("results/modern/vcf/" + REF_NAME + "/stats/vcf_merged_missing/" + REF_NAME + ".modern.merged.biallelic.fmissing{fmiss}.{chr}.vcf.stats.txt", + fmiss=config["f_missing"], + chr=CHR,) # snakemake rules From 06edaa4704f6fcfdca2ab1b7a78bbdc9ab359f8b Mon Sep 17 00:00:00 2001 From: verku Date: Thu, 5 Oct 2023 17:11:30 +0200 Subject: [PATCH 15/49] Update documentation --- config/config.yaml | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/config/config.yaml b/config/config.yaml index 
3844bce..cf28bf3 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -27,13 +27,12 @@ ref_path: "" # Relative path (from the main pipeline directory) to file listing # scaffolds/contigs linked to sex chromosomes (one scaffold/contig # name per line). -# If provided, the listed scaffolds/contigs are excluded from mlRho -# analyses (but see below how to run mlRho for autosomes and sex -# chromosomes separately from each other) and from the merged BCF file -# that is used for PCA, ROH, snpEff and GERP analyses. +# This file is used to create BED files to run mlRho separately for autosomes +# and sex chromosomes or exclusively for autosomes, and/or to create +# autosome-only BCF files for PCA, ROH, snpEff and GERP analyses. # Can also be used to specify any other contigs/scaffolds, e.g. -# unplaced or short scaffolds, for removal from the mlRho analysis -# and BCF file. +# unplaced or short scaffolds, for removal from mlRho analysis +# and BCF files. # Leave empty ("") if identity of sex chromosomes is unknown and/or # if the pipeline should be run on all scaffolds/contigs of the genome. 
sexchromosomes: "" # for example, "config/chrX_candidate_scaffolds.txt" From 899949eb2b48f78202c8a1b0e8239093fc61d935 Mon Sep 17 00:00:00 2001 From: verku Date: Fri, 6 Oct 2023 09:27:37 +0200 Subject: [PATCH 16/49] Add test to run part of pipeline without sex chromosomes --- .github/workflows/main.yaml | 8 +-- .test/config/config_mitogenomes.yaml | 59 +++++++++--------- .test/config/config_mlRho_options.yaml | 59 +++++++++--------- .test/config/config_pca_roh.yaml | 61 ++++++++++--------- ...erp.yaml => config_snpeff_gerp_autos.yaml} | 59 +++++++++--------- .test/config/seq_to_exclude.txt | 1 + 6 files changed, 130 insertions(+), 117 deletions(-) rename .test/config/{config_snpeff_gerp.yaml => config_snpeff_gerp_autos.yaml} (93%) create mode 100644 .test/config/seq_to_exclude.txt diff --git a/.github/workflows/main.yaml b/.github/workflows/main.yaml index 4c94f47..321479d 100644 --- a/.github/workflows/main.yaml +++ b/.github/workflows/main.yaml @@ -71,12 +71,12 @@ jobs: run: | snakemake --configfile .test/config/config_pca_roh.yaml -j 4 --cores 1 --use-singularity - - name: snpeff_gerp_dry + - name: snpeff_gerp_autos_dry shell: bash -l {0} run: | - snakemake -npr --configfile .test/config/config_snpeff_gerp.yaml -j 4 --cores 1 --use-singularity + snakemake -npr --configfile .test/config/config_snpeff_gerp_autos.yaml -j 4 --cores 1 --use-singularity - - name: snpeff_gerp + - name: snpeff_gerp_autos shell: bash -l {0} run: | - snakemake --configfile .test/config/config_snpeff_gerp.yaml -j 4 --cores 1 --use-singularity \ No newline at end of file + snakemake --configfile .test/config/config_snpeff_gerp_autos.yaml -j 4 --cores 1 --use-singularity \ No newline at end of file diff --git a/.test/config/config_mitogenomes.yaml b/.test/config/config_mitogenomes.yaml index 5019bd3..2b28388 100644 --- a/.test/config/config_mitogenomes.yaml +++ b/.test/config/config_mitogenomes.yaml @@ -21,6 +21,21 @@ # The file name will be reused by the pipeline and can have the file # name 
extensions *.fasta, *.fa or *.fna. ref_path: ".test/data/references/sumatran_rhino.fasta" + +# OPTIONAL: +# Relative path (from the main pipeline directory) to file listing +# scaffolds/contigs linked to sex chromosomes (one scaffold/contig +# name per line). +# Is used to create BED files to run mlRho separately for autosomes +# and sex chromosomes or exclusively for autosomes, and/or to create +# autosome-only BCF files for PCA, ROH, snpEff and GERP analyses. +# Can also be used to specify any other contigs/scaffolds, e.g. +# unplaced or short scaffolds, for removal from mlRho analysis +# and BCF files. +# Leave empty ("") if identity of sex chromosomes is unknown and/or +# if the pipeline should be run on all scaffolds/contigs of the genome. +sexchromosomes: "" # for example, "config/chrX_candidate_scaffolds.txt" +################################################################# ################################################################# @@ -278,26 +293,6 @@ CpG_samplenames: [] # Rules for BAM file processing for mlRho, and mlRho # ################################################################# -##### -# OPTIONAL: -# Generate BED files of autosomes and sex chromosomes for mlRho -# analyses, in case these should be analyzed separately from each -# other (see below for further options). -# Includes intersecting of the new chromosome-specific BED files -# with CpG- and repeat-masking BED files for downstream filtering. -autosome_sexchromosome_bed_files: False - -# Relative path (from the main pipeline directory) to file listing -# scaffolds/contigs linked to sex chromosomes (one scaffold/contig -# name per line). -# Leave empty ("") if identity of sex chromosomes is unknown and/or -# if mlRho should be run on all scaffolds/contigs of the genome. -# Keep the path to the file when running the next step (mlRho) -# separately for autosomes and sex chromosomes or only for autosomes. 
-sexchromosomes: "" # for example, "config/chrX_candidate_scaffolds.txt" -##### - - ##### # Run mlRho 2.9 on filtered BAM files. # Automatically generates a PDF file with a plot of genome-wide @@ -315,21 +310,22 @@ mlRho: False # and/or mlRho should be run on all contigs/scaffolds, # set mlRho_autosomes_sexchromosomes to False and do not provide # a path to a text file with sex-chromosomal contigs/scaffolds -# above when running mlRho. +# with the reference genome ("sexchromosomes") when running mlRho. # # 2) If the identity of sex-chromosomal contigs/scaffolds is known, # mlRho analyses can be run for autosomes and sex chromosomes # separately from each other. # In that case, set mlRho_autosomes_sexchromosomes to True and # provide the path to the file with sex-chromosomal contigs/scaffolds -# above when running mlRho. +# with the reference genome ("sexchromosomes") when running mlRho. # # 3) If the identity of sex-chromosomal contigs/scaffolds is known, -# sex-chromosomal contigs/scaffolds can be entirely excluded from +# sex-chromosomal contigs/scaffolds (or other contigs/scaffolds such +# as unplaced or short scaffolds) can be entirely excluded from # the analysis. # In that case, set mlRho_autosomes_sexchromosomes to False and # provide the path to the file with sex-chromosomal contigs/scaffolds -# above when running mlRho. +# with the reference genome ("sexchromosomes") when running mlRho. mlRho_autosomes_sexchromosomes: False ##### ################################################################# @@ -362,16 +358,23 @@ vcf_qual_repeat_filtering: False ##### # Merge BCF files into a BCF file containing all samples and remove all # sites that are not biallelic and with missing data across all samples -# up to a certain threshold as defined below. +# up to a certain threshold as defined below. 
+# If the path to a file with sex-chromosomal contigs/scaffolds is provided +# with the reference genome ("sexchromosomes"), these scaffolds/contigs are +# removed from the merged and filtered BCF file and all downstream analyses +# (optional). # Extract 1) all historical and 2) all modern samples from the merged and # filtered BCF file. -# Create a BED file of sites that remain after filtering across all samples -# to be used for downstream filtering of individual BCF files. +# Create a BED file of sites that remain after filtering and contig/scaffold +# removal across all samples to be used for downstream filtering of individual +# BCF files. merge_vcfs_per_dataset: False # Maximum allowed fraction of missing genotypes across all samples for a # site to be kept in the BCF and BED file, to ensure that the same sites -# are compared between historical and modern samples. +# are compared between historical and modern samples. Has to be a floating +# point number between 0.0 (no missing data allowed) and 1.0 (sites are +# allowed that are completely missing). f_missing: 0.1 # default: 0.1 (i.e. maximum 10% missing genotypes per site) ##### diff --git a/.test/config/config_mlRho_options.yaml b/.test/config/config_mlRho_options.yaml index 6876861..1f771b2 100644 --- a/.test/config/config_mlRho_options.yaml +++ b/.test/config/config_mlRho_options.yaml @@ -21,6 +21,21 @@ # The file name will be reused by the pipeline and can have the file # name extensions *.fasta, *.fa or *.fna. ref_path: ".test/data/references/sumatran_rhino.fasta" + +# OPTIONAL: +# Relative path (from the main pipeline directory) to file listing +# scaffolds/contigs linked to sex chromosomes (one scaffold/contig +# name per line). +# Is used to create BED files to run mlRho separately for autosomes +# and sex chromosomes or exclusively for autosomes, and/or to create +# autosome-only BCF files for PCA, ROH, snpEff and GERP analyses. +# Can also be used to specify any other contigs/scaffolds, e.g. 
+# unplaced or short scaffolds, for removal from mlRho analysis +# and BCF files. +# Leave empty ("") if identity of sex chromosomes is unknown and/or +# if the pipeline should be run on all scaffolds/contigs of the genome. +sexchromosomes: "" # for example, "config/chrX_candidate_scaffolds.txt" +################################################################# ################################################################# @@ -278,26 +293,6 @@ CpG_samplenames: ["S03", "S08"] # Rules for BAM file processing for mlRho, and mlRho # ################################################################# -##### -# OPTIONAL: -# Generate BED files of autosomes and sex chromosomes for mlRho -# analyses, in case these should be analyzed separately from each -# other (see below for further options). -# Includes intersecting of the new chromosome-specific BED files -# with CpG- and repeat-masking BED files for downstream filtering. -autosome_sexchromosome_bed_files: False - -# Relative path (from the main pipeline directory) to file listing -# scaffolds/contigs linked to sex chromosomes (one scaffold/contig -# name per line). -# Leave empty ("") if identity of sex chromosomes is unknown and/or -# if mlRho should be run on all scaffolds/contigs of the genome. -# Keep the path to the file when running the next step (mlRho) -# separately for autosomes and sex chromosomes or only for autosomes. -sexchromosomes: "" # for example, "config/chrX_candidate_scaffolds.txt" -##### - - ##### # Run mlRho 2.9 on filtered BAM files. # Automatically generates a PDF file with a plot of genome-wide @@ -315,21 +310,22 @@ mlRho: True # and/or mlRho should be run on all contigs/scaffolds, # set mlRho_autosomes_sexchromosomes to False and do not provide # a path to a text file with sex-chromosomal contigs/scaffolds -# above when running mlRho. +# with the reference genome ("sexchromosomes") when running mlRho. 
# # 2) If the identity of sex-chromosomal contigs/scaffolds is known, # mlRho analyses can be run for autosomes and sex chromosomes # separately from each other. # In that case, set mlRho_autosomes_sexchromosomes to True and # provide the path to the file with sex-chromosomal contigs/scaffolds -# above when running mlRho. +# with the reference genome ("sexchromosomes") when running mlRho. # # 3) If the identity of sex-chromosomal contigs/scaffolds is known, -# sex-chromosomal contigs/scaffolds can be entirely excluded from +# sex-chromosomal contigs/scaffolds (or other contigs/scaffolds such +# as unplaced or short scaffolds) can be entirely excluded from # the analysis. # In that case, set mlRho_autosomes_sexchromosomes to False and # provide the path to the file with sex-chromosomal contigs/scaffolds -# above when running mlRho. +# with the reference genome ("sexchromosomes") when running mlRho. mlRho_autosomes_sexchromosomes: False ##### ################################################################# @@ -362,16 +358,23 @@ vcf_qual_repeat_filtering: False ##### # Merge BCF files into a BCF file containing all samples and remove all # sites that are not biallelic and with missing data across all samples -# up to a certain threshold as defined below. +# up to a certain threshold as defined below. +# If the path to a file with sex-chromosomal contigs/scaffolds is provided +# with the reference genome ("sexchromosomes"), these scaffolds/contigs are +# removed from the merged and filtered BCF file and all downstream analyses +# (optional). # Extract 1) all historical and 2) all modern samples from the merged and # filtered BCF file. -# Create a BED file of sites that remain after filtering across all samples -# to be used for downstream filtering of individual BCF files. +# Create a BED file of sites that remain after filtering and contig/scaffold +# removal across all samples to be used for downstream filtering of individual +# BCF files. 
merge_vcfs_per_dataset: False # Maximum allowed fraction of missing genotypes across all samples for a # site to be kept in the BCF and BED file, to ensure that the same sites -# are compared between historical and modern samples. +# are compared between historical and modern samples. Has to be a floating +# point number between 0.0 (no missing data allowed) and 1.0 (sites are +# allowed that are completely missing). f_missing: 0.1 # default: 0.1 (i.e. maximum 10% missing genotypes per site) ##### diff --git a/.test/config/config_pca_roh.yaml b/.test/config/config_pca_roh.yaml index 40e2097..e085389 100644 --- a/.test/config/config_pca_roh.yaml +++ b/.test/config/config_pca_roh.yaml @@ -21,6 +21,21 @@ # The file name will be reused by the pipeline and can have the file # name extensions *.fasta, *.fa or *.fna. ref_path: ".test/data/references/sumatran_rhino.fasta" + +# OPTIONAL: +# Relative path (from the main pipeline directory) to file listing +# scaffolds/contigs linked to sex chromosomes (one scaffold/contig +# name per line). +# Is used to create BED files to run mlRho separately for autosomes +# and sex chromosomes or exclusively for autosomes, and/or to create +# autosome-only BCF files for PCA, ROH, snpEff and GERP analyses. +# Can also be used to specify any other contigs/scaffolds, e.g. +# unplaced or short scaffolds, for removal from mlRho analysis +# and BCF files. +# Leave empty ("") if identity of sex chromosomes is unknown and/or +# if the pipeline should be run on all scaffolds/contigs of the genome. 
+sexchromosomes: "" # for example, "config/chrX_candidate_scaffolds.txt" +################################################################# ################################################################# @@ -278,26 +293,6 @@ CpG_samplenames: [] # Rules for BAM file processing for mlRho, and mlRho # ################################################################# -##### -# OPTIONAL: -# Generate BED files of autosomes and sex chromosomes for mlRho -# analyses, in case these should be analyzed separately from each -# other (see below for further options). -# Includes intersecting of the new chromosome-specific BED files -# with CpG- and repeat-masking BED files for downstream filtering. -autosome_sexchromosome_bed_files: False - -# Relative path (from the main pipeline directory) to file listing -# scaffolds/contigs linked to sex chromosomes (one scaffold/contig -# name per line). -# Leave empty ("") if identity of sex chromosomes is unknown and/or -# if mlRho should be run on all scaffolds/contigs of the genome. -# Keep the path to the file when running the next step (mlRho) -# separately for autosomes and sex chromosomes or only for autosomes. -sexchromosomes: "" # for example, "config/chrX_candidate_scaffolds.txt" -##### - - ##### # Run mlRho 2.9 on filtered BAM files. # Automatically generates a PDF file with a plot of genome-wide @@ -315,21 +310,22 @@ mlRho: False # and/or mlRho should be run on all contigs/scaffolds, # set mlRho_autosomes_sexchromosomes to False and do not provide # a path to a text file with sex-chromosomal contigs/scaffolds -# above when running mlRho. +# with the reference genome ("sexchromosomes") when running mlRho. # # 2) If the identity of sex-chromosomal contigs/scaffolds is known, # mlRho analyses can be run for autosomes and sex chromosomes # separately from each other. # In that case, set mlRho_autosomes_sexchromosomes to True and # provide the path to the file with sex-chromosomal contigs/scaffolds -# above when running mlRho. 
+# with the reference genome ("sexchromosomes") when running mlRho. # # 3) If the identity of sex-chromosomal contigs/scaffolds is known, -# sex-chromosomal contigs/scaffolds can be entirely excluded from +# sex-chromosomal contigs/scaffolds (or other contigs/scaffolds such +# as unplaced or short scaffolds) can be entirely excluded from # the analysis. # In that case, set mlRho_autosomes_sexchromosomes to False and # provide the path to the file with sex-chromosomal contigs/scaffolds -# above when running mlRho. +# with the reference genome ("sexchromosomes") when running mlRho. mlRho_autosomes_sexchromosomes: False ##### ################################################################# @@ -362,17 +358,24 @@ vcf_qual_repeat_filtering: False ##### # Merge BCF files into a BCF file containing all samples and remove all # sites that are not biallelic and with missing data across all samples -# up to a certain threshold as defined below. +# up to a certain threshold as defined below. +# If the path to a file with sex-chromosomal contigs/scaffolds is provided +# with the reference genome ("sexchromosomes"), these scaffolds/contigs are +# removed from the merged and filtered BCF file and all downstream analyses +# (optional). # Extract 1) all historical and 2) all modern samples from the merged and # filtered BCF file. -# Create a BED file of sites that remain after filtering across all samples -# to be used for downstream filtering of individual BCF files. +# Create a BED file of sites that remain after filtering and contig/scaffold +# removal across all samples to be used for downstream filtering of individual +# BCF files. merge_vcfs_per_dataset: False # Maximum allowed fraction of missing genotypes across all samples for a # site to be kept in the BCF and BED file, to ensure that the same sites -# are compared between historical and modern samples. -f_missing: 0.5 # default: 0.1 (i.e. 
maximum 10% missing genotypes per site) +# are compared between historical and modern samples. Has to be a floating +# point number between 0.0 (no missing data allowed) and 1.0 (sites are +# allowed that are completely missing). +f_missing: 0.1 # default: 0.1 (i.e. maximum 10% missing genotypes per site) ##### ################################################################# diff --git a/.test/config/config_snpeff_gerp.yaml b/.test/config/config_snpeff_gerp_autos.yaml similarity index 93% rename from .test/config/config_snpeff_gerp.yaml rename to .test/config/config_snpeff_gerp_autos.yaml index 0c8de76..bfaddb4 100644 --- a/.test/config/config_snpeff_gerp.yaml +++ b/.test/config/config_snpeff_gerp_autos.yaml @@ -21,6 +21,21 @@ # The file name will be reused by the pipeline and can have the file # name extensions *.fasta, *.fa or *.fna. ref_path: ".test/data/references/sumatran_rhino.fasta" + +# OPTIONAL: +# Relative path (from the main pipeline directory) to file listing +# scaffolds/contigs linked to sex chromosomes (one scaffold/contig +# name per line). +# Is used to create BED files to run mlRho separately for autosomes +# and sex chromosomes or exclusively for autosomes, and/or to create +# autosome-only BCF files for PCA, ROH, snpEff and GERP analyses. +# Can also be used to specify any other contigs/scaffolds, e.g. +# unplaced or short scaffolds, for removal from mlRho analysis +# and BCF files. +# Leave empty ("") if identity of sex chromosomes is unknown and/or +# if the pipeline should be run on all scaffolds/contigs of the genome. 
+sexchromosomes: ".test/config/seq_to_exclude.txt" # for example, "config/chrX_candidate_scaffolds.txt" +################################################################# ################################################################# @@ -278,26 +293,6 @@ CpG_samplenames: [] # Rules for BAM file processing for mlRho, and mlRho # ################################################################# -##### -# OPTIONAL: -# Generate BED files of autosomes and sex chromosomes for mlRho -# analyses, in case these should be analyzed separately from each -# other (see below for further options). -# Includes intersecting of the new chromosome-specific BED files -# with CpG- and repeat-masking BED files for downstream filtering. -autosome_sexchromosome_bed_files: False - -# Relative path (from the main pipeline directory) to file listing -# scaffolds/contigs linked to sex chromosomes (one scaffold/contig -# name per line). -# Leave empty ("") if identity of sex chromosomes is unknown and/or -# if mlRho should be run on all scaffolds/contigs of the genome. -# Keep the path to the file when running the next step (mlRho) -# separately for autosomes and sex chromosomes or only for autosomes. -sexchromosomes: "" # for example, "config/chrX_candidate_scaffolds.txt" -##### - - ##### # Run mlRho 2.9 on filtered BAM files. # Automatically generates a PDF file with a plot of genome-wide @@ -315,21 +310,22 @@ mlRho: False # and/or mlRho should be run on all contigs/scaffolds, # set mlRho_autosomes_sexchromosomes to False and do not provide # a path to a text file with sex-chromosomal contigs/scaffolds -# above when running mlRho. +# with the reference genome ("sexchromosomes") when running mlRho. # # 2) If the identity of sex-chromosomal contigs/scaffolds is known, # mlRho analyses can be run for autosomes and sex chromosomes # separately from each other. 
# In that case, set mlRho_autosomes_sexchromosomes to True and # provide the path to the file with sex-chromosomal contigs/scaffolds -# above when running mlRho. +# with the reference genome ("sexchromosomes") when running mlRho. # # 3) If the identity of sex-chromosomal contigs/scaffolds is known, -# sex-chromosomal contigs/scaffolds can be entirely excluded from +# sex-chromosomal contigs/scaffolds (or other contigs/scaffolds such +# as unplaced or short scaffolds) can be entirely excluded from # the analysis. # In that case, set mlRho_autosomes_sexchromosomes to False and # provide the path to the file with sex-chromosomal contigs/scaffolds -# above when running mlRho. +# with the reference genome ("sexchromosomes") when running mlRho. mlRho_autosomes_sexchromosomes: False ##### ################################################################# @@ -362,16 +358,23 @@ vcf_qual_repeat_filtering: False ##### # Merge BCF files into a BCF file containing all samples and remove all # sites that are not biallelic and with missing data across all samples -# up to a certain threshold as defined below. +# up to a certain threshold as defined below. +# If the path to a file with sex-chromosomal contigs/scaffolds is provided +# with the reference genome ("sexchromosomes"), these scaffolds/contigs are +# removed from the merged and filtered BCF file and all downstream analyses +# (optional). # Extract 1) all historical and 2) all modern samples from the merged and # filtered BCF file. -# Create a BED file of sites that remain after filtering across all samples -# to be used for downstream filtering of individual BCF files. +# Create a BED file of sites that remain after filtering and contig/scaffold +# removal across all samples to be used for downstream filtering of individual +# BCF files. 
merge_vcfs_per_dataset: False # Maximum allowed fraction of missing genotypes across all samples for a # site to be kept in the BCF and BED file, to ensure that the same sites -# are compared between historical and modern samples. +# are compared between historical and modern samples. Has to be a floating +# point number between 0.0 (no missing data allowed) and 1.0 (sites are +# allowed that are completely missing). f_missing: 0.1 # default: 0.1 (i.e. maximum 10% missing genotypes per site) ##### diff --git a/.test/config/seq_to_exclude.txt b/.test/config/seq_to_exclude.txt new file mode 100644 index 0000000..add17d8 --- /dev/null +++ b/.test/config/seq_to_exclude.txt @@ -0,0 +1 @@ +Sc9M7eS_1280_HRSCAF_1917_split_75000 \ No newline at end of file From 5f1aae86e0912fcad8d8c04356d48e0b732b8983 Mon Sep 17 00:00:00 2001 From: verku Date: Fri, 6 Oct 2023 09:46:04 +0200 Subject: [PATCH 17/49] Remove *bai file from mlRho rule input to avoid triggering reruns of mappings because the *bai files are temporary files and because it is unnecessary --- workflow/rules/7_mlRho.smk | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/workflow/rules/7_mlRho.smk b/workflow/rules/7_mlRho.smk index 53bd40d..8e1fd26 100644 --- a/workflow/rules/7_mlRho.smk +++ b/workflow/rules/7_mlRho.smk @@ -27,8 +27,7 @@ def bam_file_mlRho(wildcards): bam = "results/modern/mapping/" + REF_NAME + "/{sample}.merged.rmdup.merged.realn.bam".format(sample=wildcards.sample) elif wildcards.sample in MODERN_SUBSAMPLED_SAMPLES: bam = "results/modern/mapping/" + REF_NAME + "/{sample}.merged.rmdup.merged.realn.mapped_q30.subs_dp{DP}.bam".format(sample=wildcards.sample, DP=config["subsampling_depth"]) - bai = bam + ".bai" - return [bam, bai] + return [bam] def depth_file_mlRho(wildcards): """Select correct depth stats file for each sample""" From dcc027e674ec14aad4cc2e53afc1f3811cf86966 Mon Sep 17 00:00:00 2001 From: verku Date: Fri, 6 Oct 2023 09:53:46 +0200 Subject: [PATCH 18/49] Update fastqc to 
version 0.12.1 to increase default memory allocation --- workflow/rules/1.1_fastq_processing.smk | 10 +++++----- workflow/rules/13_GERP.smk | 2 +- workflow/rules/3.1_bam_rmdup_realign_indels.smk | 2 +- workflow/rules/3.2_historical_bam_mapDamage.smk | 2 +- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/workflow/rules/1.1_fastq_processing.smk b/workflow/rules/1.1_fastq_processing.smk index d6c5a25..ad405e0 100644 --- a/workflow/rules/1.1_fastq_processing.smk +++ b/workflow/rules/1.1_fastq_processing.smk @@ -50,7 +50,7 @@ rule fastqc_historical_raw: "data/logs/1.1_fastq_processing/historical/{sample}_{index}_{lane}_R{nr}_fastqc_historical_raw.log", threads: 2 singularity: - "docker://biocontainers/fastqc:v0.11.9_cv7" + "docker://quay.io/biocontainers/fastqc:0.12.1--hdfd78af_0" shell: """ fastqc -o {params.dir} -t {threads} --extract {input.fastq} 2> {log} @@ -115,7 +115,7 @@ rule fastqc_modern_raw: "data/logs/1.1_fastq_processing/modern/{sample}_{index}_{lane}_R{nr}_fastqc_modern_raw.log", threads: 2 singularity: - "docker://biocontainers/fastqc:v0.11.9_cv7" + "docker://quay.io/biocontainers/fastqc:0.12.1--hdfd78af_0" shell: """ fastqc -o {params.dir} -t {threads} --extract {input.fastq} 2> {log} @@ -219,7 +219,7 @@ rule fastqc_historical_merged: "historical_fastq_trimmed_group" threads: 2 singularity: - "docker://biocontainers/fastqc:v0.11.9_cv7" + "docker://quay.io/biocontainers/fastqc:0.12.1--hdfd78af_0" shell: """ fastqc -o {params.dir} -t {threads} --extract {input} 2> {log} @@ -242,7 +242,7 @@ rule fastqc_historical_unmerged: "historical_fastq_trimmed_group" threads: 2 singularity: - "docker://biocontainers/fastqc:v0.11.9_cv7" + "docker://quay.io/biocontainers/fastqc:0.12.1--hdfd78af_0" shell: """ if [ -s {input} ] @@ -270,7 +270,7 @@ rule fastqc_modern_trimmed: "results/logs/1.1_fastq_processing/modern/{sample}_{index}_{lane}_R{nr}_trimmed_fastqc_modern_trimmed.log", threads: 2 singularity: - "docker://biocontainers/fastqc:v0.11.9_cv7" + 
"docker://quay.io/biocontainers/fastqc:0.12.1--hdfd78af_0" shell: """ fastqc -o {params.dir} -t {threads} --extract {input} 2> {log} diff --git a/workflow/rules/13_GERP.smk b/workflow/rules/13_GERP.smk index cdf9c17..e88d846 100644 --- a/workflow/rules/13_GERP.smk +++ b/workflow/rules/13_GERP.smk @@ -361,7 +361,7 @@ rule outgroup_fastqc: "results/logs/13_GERP/fastq/{gerpref}_outgroup_fastqc.log", threads: 2 singularity: - "docker://biocontainers/fastqc:v0.11.9_cv7" + "docker://quay.io/biocontainers/fastqc:0.12.1--hdfd78af_0" shell: """ fastqc -o {params.dir} -t {threads} --extract {input.fastq} 2> {log} diff --git a/workflow/rules/3.1_bam_rmdup_realign_indels.smk b/workflow/rules/3.1_bam_rmdup_realign_indels.smk index 6eba498..a876557 100644 --- a/workflow/rules/3.1_bam_rmdup_realign_indels.smk +++ b/workflow/rules/3.1_bam_rmdup_realign_indels.smk @@ -594,7 +594,7 @@ rule realigned_bam_fastqc: "results/logs/3.1_bam_rmdup_realign_indels/{dataset}/" + REF_NAME + "/{sample}_realigned_bam_fastqc.log", threads: 2 singularity: - "docker://biocontainers/fastqc:v0.11.9_cv7" + "docker://quay.io/biocontainers/fastqc:0.12.1--hdfd78af_0" shell: """ fastqc -o {params.dir} -t {threads} --extract {input.bam} 2> {log} diff --git a/workflow/rules/3.2_historical_bam_mapDamage.smk b/workflow/rules/3.2_historical_bam_mapDamage.smk index 6eccf10..3d4ad83 100644 --- a/workflow/rules/3.2_historical_bam_mapDamage.smk +++ b/workflow/rules/3.2_historical_bam_mapDamage.smk @@ -100,7 +100,7 @@ rule rescaled_bam_fastqc: "results/logs/3.2_historical_bam_mapDamage/" + REF_NAME + "/{sample}_rescaled_bam_fastqc.log", threads: 2 singularity: - "docker://biocontainers/fastqc:v0.11.9_cv7" + "docker://quay.io/biocontainers/fastqc:0.12.1--hdfd78af_0" shell: """ fastqc -o {params.dir} -t {threads} --extract {input.bam} 2> {log} From 81e24df4f7927d699483a695817ea512ffc34c38 Mon Sep 17 00:00:00 2001 From: verku Date: Fri, 6 Oct 2023 09:57:32 +0200 Subject: [PATCH 19/49] Fix typo --- 
workflow/rules/1.1_fastq_processing.smk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflow/rules/1.1_fastq_processing.smk b/workflow/rules/1.1_fastq_processing.smk index ad405e0..5c4f46c 100644 --- a/workflow/rules/1.1_fastq_processing.smk +++ b/workflow/rules/1.1_fastq_processing.smk @@ -159,7 +159,7 @@ rule fastp_historical: R2_un=temp("results/historical/trimming/{sample}_{index}_{lane}_R2_unmerged.fastq.gz"), merged="results/historical/trimming/{sample}_{index}_{lane}_trimmed_merged.fastq.gz", html="results/historical/trimming/stats/{sample}_{index}_{lane}_fastp_report.html", - json=temp("results/modern/trimming/stats/{sample}_{index}_{lane}_fastp_report.json"), + json=temp("results/historical/trimming/stats/{sample}_{index}_{lane}_fastp_report.json"), params: readlength=config["hist_readlength"], report="fastp report for {sample}_{index}_{lane}", From 5ea42e16c0a79ad453d0742972c3badbefe07d47 Mon Sep 17 00:00:00 2001 From: verku Date: Fri, 6 Oct 2023 11:01:20 +0200 Subject: [PATCH 20/49] Update slurm configuration files with compute resources for new or changed rules --- config/slurm/cluster.yaml | 21 +++++++++------------ config/slurm/profile/config.yaml | 22 +++++++++------------- 2 files changed, 18 insertions(+), 25 deletions(-) diff --git a/config/slurm/cluster.yaml b/config/slurm/cluster.yaml index 5e7abe2..0178888 100644 --- a/config/slurm/cluster.yaml +++ b/config/slurm/cluster.yaml @@ -204,10 +204,19 @@ filter_vcf_biallelic: filter_vcf_missing: time: 1-00:00:00 cpus-per-task: 2 +remove_chromosomes: + time: 1-00:00:00 + cpus-per-task: 2 extract_historical_samples: time: 05:00:00 extract_modern_samples: time: 05:00:00 +repmasked_bcf2vcf: + time: 05:00:00 + cpus-per-task: 2 +filter_biallelic_missing_vcf: + time: 1-00:00:00 + cpus-per-task: 6 ### PCA vcf2plink_pca: time: 05:00:00 @@ -220,14 +229,8 @@ vcf2plink_hwe: time: 05:00:00 cpus-per-task: 2 ### snpEff -repmasked_bcf2vcf_snpEff: - time: 05:00:00 - cpus-per-task: 2 
build_snpEff_db: time: 05:00:00 -filter_biallelic_missing_vcf_snpEff: - time: 1-00:00:00 - cpus-per-task: 6 annotate_vcf: time: 05:00:00 ### gerp @@ -264,12 +267,6 @@ merge_per_chunk: merge_gerp_gz: time: 1-00:00:00 cpus-per-task: 2 -repmasked_bcf2vcf_gerp: - time: 05:00:00 - cpus-per-task: 2 -filter_biallelic_missing_vcf_gerp: - time: 1-00:00:00 - cpus-per-task: 6 split_vcf_files: time: 05:00:00 gerp_derived_alleles: diff --git a/config/slurm/profile/config.yaml b/config/slurm/profile/config.yaml index f877080..46ecaf1 100644 --- a/config/slurm/profile/config.yaml +++ b/config/slurm/profile/config.yaml @@ -85,14 +85,14 @@ set-threads: - merge_all_vcfs=6 - filter_vcf_biallelic=2 - filter_vcf_missing=2 + - remove_chromosomes=2 + - repmasked_bcf2vcf=2 + - filter_biallelic_missing_vcf=6 ### PCA - vcf2plink_pca=2 ### runs of homozygosity - filter_vcf_hwe=2 - vcf2plink_hwe=2 -### snpEff - - repmasked_bcf2vcf_snpEff=2 - - filter_biallelic_missing_vcf_snpEff=6 ### gerp - outgroup_fastqc=2 - align2target=8 @@ -104,8 +104,6 @@ set-threads: - produce_contig_out=2 - merge_per_chunk=2 - merge_gerp_gz=2 - - repmasked_bcf2vcf_gerp=2 - - filter_biallelic_missing_vcf_gerp=6 - gerp_derived_alleles=2 - merge_gerp_alleles_per_chunk=4 - merge_gerp_alleles_gz=4 @@ -233,8 +231,14 @@ set-resources: - filter_vcf_biallelic:mem_mb=12800 - filter_vcf_missing:runtime=1440 - filter_vcf_missing:mem_mb=12800 + - remove_chromosomes:runtime=1440 + - remove_chromosomes:mem_mb=12800 - extract_historical_samples:runtime=300 - extract_modern_samples:runtime=300 + - repmasked_bcf2vcf:runtime=300 + - repmasked_bcf2vcf:mem_mb=12800 + - filter_biallelic_missing_vcf:runtime=1440 + - filter_biallelic_missing_vcf:mem_mb=38400 ### PCA - vcf2plink_pca:runtime=300 - vcf2plink_pca:mem_mb=12800 @@ -244,11 +248,7 @@ set-resources: - vcf2plink_hwe:runtime=300 - vcf2plink_hwe:mem_mb=12800 ### snpEff - - repmasked_bcf2vcf_snpEff:runtime=300 - - repmasked_bcf2vcf_snpEff:mem_mb=12800 - build_snpEff_db:runtime=300 - - 
filter_biallelic_missing_vcf_snpEff:runtime=1440 - - filter_biallelic_missing_vcf_snpEff:mem_mb=38400 - annotate_vcf:runtime=300 ### gerp - outgroups2fastq:runtime=1440 @@ -272,10 +272,6 @@ set-resources: - merge_per_chunk:mem_mb=12800 - merge_gerp_gz:runtime=1440 - merge_gerp_gz:mem_mb=12800 - - repmasked_bcf2vcf_gerp:runtime=300 - - repmasked_bcf2vcf_gerp:mem_mb=12800 - - filter_biallelic_missing_vcf_gerp:runtime=1440 - - filter_biallelic_missing_vcf_gerp:mem_mb=38400 - split_vcf_files:runtime=300 - gerp_derived_alleles:runtime=14400 - gerp_derived_alleles:mem_mb=12800 From 322937a407379ee94b50b2495240c8ef0c9383b4 Mon Sep 17 00:00:00 2001 From: verku Date: Fri, 6 Oct 2023 12:00:03 +0200 Subject: [PATCH 21/49] Move the filtering of individual BCF files from snpEff and GERP to 9_merge_vcfs.smk so that these rules are run only once if both steps are run --- workflow/rules/12_snpEff.smk | 42 +----- workflow/rules/13_GERP.smk | 230 +------------------------------- workflow/rules/9_merge_vcfs.smk | 224 +++++++++++++++++++++++++++++++ 3 files changed, 234 insertions(+), 262 deletions(-) diff --git a/workflow/rules/12_snpEff.smk b/workflow/rules/12_snpEff.smk index fbfecfd..38ee804 100644 --- a/workflow/rules/12_snpEff.smk +++ b/workflow/rules/12_snpEff.smk @@ -4,6 +4,8 @@ # Code collecting output files from this part of the pipeline if os.path.exists(config["historical_samples"]) and os.path.exists(config["modern_samples"]): + all_outputs.append(expand("results/{dataset}/vcf/" + REF_NAME + "/stats/multiqc/multiqc_report.html", + dataset=["historical", "modern"])) all_outputs.append(expand("results/all/snpEff/" + REF_NAME + ".all.fmissing{fmiss}.{chr}.snpEff_variant_impact_plot.pdf", fmiss=config["f_missing"], chr=CHR,)) @@ -11,12 +13,14 @@ if os.path.exists(config["historical_samples"]) and os.path.exists(config["moder all_outputs.append("results/modern/snpEff/" + REF_NAME + "/multiqc/multiqc_report.html") elif os.path.exists(config["historical_samples"]): + 
all_outputs.append("results/historical/vcf/" + REF_NAME + "/stats/multiqc/multiqc_report.html") all_outputs.append(expand("results/historical/snpEff/" + REF_NAME + ".historical.fmissing{fmiss}.{chr}.snpEff_variant_impact_plot.pdf", fmiss=config["f_missing"], chr=CHR,)) all_outputs.append("results/historical/snpEff/" + REF_NAME + "/multiqc/multiqc_report.html") elif os.path.exists(config["modern_samples"]): + all_outputs.append("results/modern/vcf/" + REF_NAME + "/stats/multiqc/multiqc_report.html") all_outputs.append(expand("results/modern/snpEff/" + REF_NAME + ".modern.fmissing{fmiss}.{chr}.snpEff_variant_impact_plot.pdf", fmiss=config["f_missing"], chr=CHR,)) @@ -247,46 +251,10 @@ rule build_snpEff_db: """ -rule repmasked_bcf2vcf_snpEff: - """Convert bcf format to vcf.gz for removal of sites""" - input: - bcf="results/{dataset}/vcf/" + REF_NAME + "/{sample}.merged.rmdup.merged.{processed}.snps5.noIndel.QUAL30.dp.AB.repma.bcf", - index="results/{dataset}/vcf/" + REF_NAME + "/{sample}.merged.rmdup.merged.{processed}.snps5.noIndel.QUAL30.dp.AB.repma.bcf.csi", - output: - vcf=temp("results/{dataset}/snpEff/" + REF_NAME + "/{sample}.merged.rmdup.merged.{processed}.snps5.noIndel.QUAL30.dp.AB.repma.vcf.gz"), - log: - "results/logs/12_snpEff/{dataset}/" + REF_NAME + "/{sample}.{processed}_repmasked_bcf2vcf.log", - singularity: - "docker://quay.io/biocontainers/bcftools:1.9--h68d8f2e_9" - shell: - """ - bcftools convert -O z -o {output.vcf} {input.bcf} 2> {log} - """ - - -rule filter_biallelic_missing_vcf_snpEff: - """Keep only sites with certain upper fraction missingness as specified in config file and sites that are biallelic across all samples from individual vcf files""" - input: - vcf=rules.repmasked_bcf2vcf_snpEff.output.vcf, - bed=rules.filtered_vcf2bed.output.bed, - genomefile=rules.genome_file.output.genomefile, - output: - filtered=temp("results/{dataset}/snpEff/" + REF_NAME + 
"/{sample}.merged.rmdup.merged.{processed}.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.vcf.gz"), - threads: 6 - log: - "results/logs/12_snpEff/{dataset}/" + REF_NAME + "/{sample}.{processed}_fmissing{fmiss}.{chr}_filter_biallelic_missing_vcf.log", - singularity: - "docker://nbisweden/generode-bedtools-2.29.2" - shell: - """ - bedtools intersect -a {input.vcf} -b {input.bed} -header -sorted -g {input.genomefile} | bgzip -c > {output.filtered} 2> {log} - """ - - rule annotate_vcf: """Annotate the VCF files of each individual""" input: - vcf=rules.filter_biallelic_missing_vcf_snpEff.output.filtered, + vcf=rules.filter_biallelic_missing_vcf.output.filtered, db=rules.build_snpEff_db.output.db, config=rules.update_snpEff_config.output.config, output: diff --git a/workflow/rules/13_GERP.smk b/workflow/rules/13_GERP.smk index e9759c9..b38b06a 100644 --- a/workflow/rules/13_GERP.smk +++ b/workflow/rules/13_GERP.smk @@ -6,7 +6,7 @@ all_outputs.append("results/gerp/" + REF_NAME + ".ancestral.rates.gerp.hist.pdf") if os.path.exists(config["historical_samples"]) and os.path.exists(config["modern_samples"]): - all_outputs.append(expand("results/gerp/{dataset}/" + REF_NAME + "/vcf/stats/multiqc/multiqc_report.html", + all_outputs.append(expand("results/{dataset}/vcf/" + REF_NAME + "/stats/multiqc/multiqc_report.html", dataset=["historical", "modern"])) all_outputs.append(expand("results/gerp/all/" + REF_NAME + ".all.fmissing{fmiss}.{chr}.relative_mutational_load.gerp_{minGERP}_{maxGERP}_plot.pdf", fmiss=config["f_missing"], @@ -15,7 +15,7 @@ if os.path.exists(config["historical_samples"]) and os.path.exists(config["moder maxGERP=config["max_gerp"],)) elif os.path.exists(config["historical_samples"]): - all_outputs.append("results/gerp/historical/" + REF_NAME + "/vcf/stats/multiqc/multiqc_report.html") + all_outputs.append("results/historical/vcf/" + REF_NAME + "/stats/multiqc/multiqc_report.html") all_outputs.append(expand("results/gerp/historical/" + 
REF_NAME + ".historical.fmissing{fmiss}.{chr}.relative_mutational_load.gerp_{minGERP}_{maxGERP}_plot.pdf", fmiss=config["f_missing"], chr=CHR, @@ -23,7 +23,7 @@ elif os.path.exists(config["historical_samples"]): maxGERP=config["max_gerp"],)) elif os.path.exists(config["modern_samples"]): - all_outputs.append("results/gerp/modern/" + REF_NAME + "/vcf/stats/multiqc/multiqc_report.html") + all_outputs.append("results/modern/vcf/" + REF_NAME + "/stats/multiqc/multiqc_report.html") all_outputs.append(expand("results/gerp/modern/" + REF_NAME + ".modern.fmissing{fmiss}.{chr}.relative_mutational_load.gerp_{minGERP}_{maxGERP}_plot.pdf", fmiss=config["f_missing"], chr=CHR, @@ -32,136 +32,6 @@ elif os.path.exists(config["modern_samples"]): # Functions used by rules of this part of the pipeline -def historical_biallelic_missing_filtered_vcf_gerp_multiqc_inputs(wildcards): - """Input for historical_biallelic_missing_filtered_vcf_gerp_multiqc_inputs""" - rescaled_not_subsampled_not_CpG = expand("results/gerp/historical/" + REF_NAME + "/vcf/stats/{sample}.merged.rmdup.merged.realn.rescaled.Q30.sorted.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.vcf.stats.txt", - sample=HIST_RESCALED_NOT_SUBSAMPLED_NOT_CpG_SAMPLES, - fmiss=config["f_missing"], - chr=CHR,) - not_rescaled_not_subsampled_not_CpG = expand("results/gerp/historical/" + REF_NAME + "/vcf/stats/{sample}.merged.rmdup.merged.realn.Q30.sorted.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.vcf.stats.txt", - sample=HIST_NOT_RESCALED_NOT_SUBSAMPLED_NOT_CpG_SAMPLES, - fmiss=config["f_missing"], - chr=CHR,) - rescaled_subsampled_not_CpG = expand("results/gerp/historical/" + REF_NAME + "/vcf/stats/{sample}.merged.rmdup.merged.realn.rescaled.mapped_q30.subs_dp{DP}.Q30.sorted.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.vcf.stats.txt", - sample=HIST_RESCALED_SUBSAMPLED_NOT_CpG_SAMPLES, - DP=config["subsampling_depth"], - fmiss=config["f_missing"], - chr=CHR,) - 
not_rescaled_subsampled_not_CpG = expand("results/gerp/historical/" + REF_NAME + "/vcf/stats/{sample}.merged.rmdup.merged.realn.mapped_q30.subs_dp{DP}.Q30.sorted.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.vcf.stats.txt", - sample=HIST_NOT_RESCALED_SUBSAMPLED_NOT_CpG_SAMPLES, - DP=config["subsampling_depth"], - fmiss=config["f_missing"], - chr=CHR,) - outlist = (rescaled_not_subsampled_not_CpG + not_rescaled_not_subsampled_not_CpG + rescaled_subsampled_not_CpG + not_rescaled_subsampled_not_CpG) - if config["CpG_from_vcf"] == True: - rescaled_not_subsampled_CpG = expand("results/gerp/historical/" + REF_NAME + "/vcf/stats/{sample}.merged.rmdup.merged.realn.rescaled.Q30.sorted.noCpG_vcf.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.vcf.stats.txt", - sample=HIST_RESCALED_NOT_SUBSAMPLED_CpG_SAMPLES, - fmiss=config["f_missing"], - chr=CHR,) - not_rescaled_not_subsampled_CpG = expand("results/gerp/historical/" + REF_NAME + "/vcf/stats/{sample}.merged.rmdup.merged.realn.Q30.sorted.noCpG_vcf.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.vcf.stats.txt", - sample=HIST_NOT_RESCALED_NOT_SUBSAMPLED_CpG_SAMPLES, - fmiss=config["f_missing"], - chr=CHR,) - rescaled_subsampled_CpG = expand("results/gerp/historical/" + REF_NAME + "/vcf/stats/{sample}.merged.rmdup.merged.realn.rescaled.mapped_q30.subs_dp{DP}.Q30.sorted.noCpG_vcf.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.vcf.stats.txt", - sample=HIST_RESCALED_SUBSAMPLED_CpG_SAMPLES, - DP=config["subsampling_depth"], - fmiss=config["f_missing"], - chr=CHR,) - not_rescaled_subsampled_CpG = expand("results/gerp/historical/" + REF_NAME + "/vcf/stats/{sample}.merged.rmdup.merged.realn.mapped_q30.subs_dp{DP}.Q30.sorted.noCpG_vcf.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.vcf.stats.txt", - sample=HIST_NOT_RESCALED_SUBSAMPLED_CpG_SAMPLES, - DP=config["subsampling_depth"], - fmiss=config["f_missing"], - chr=CHR,) - outlist += 
(rescaled_not_subsampled_CpG + not_rescaled_not_subsampled_CpG + rescaled_subsampled_CpG + not_rescaled_subsampled_CpG) - elif config["CpG_from_reference"] == True: - rescaled_not_subsampled_CpG = expand("results/gerp/historical/" + REF_NAME + "/vcf/stats/{sample}.merged.rmdup.merged.realn.rescaled.Q30.sorted.noCpG_ref.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.vcf.stats.txt", - sample=HIST_RESCALED_NOT_SUBSAMPLED_CpG_SAMPLES, - fmiss=config["f_missing"], - chr=CHR,) - not_rescaled_not_subsampled_CpG = expand("results/gerp/historical/" + REF_NAME + "/vcf/stats/{sample}.merged.rmdup.merged.realn.Q30.sorted.noCpG_ref.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.vcf.stats.txt", - sample=HIST_NOT_RESCALED_NOT_SUBSAMPLED_CpG_SAMPLES, - fmiss=config["f_missing"], - chr=CHR,) - rescaled_subsampled_CpG = expand("results/gerp/historical/" + REF_NAME + "/vcf/stats/{sample}.merged.rmdup.merged.realn.rescaled.mapped_q30.subs_dp{DP}.Q30.sorted.noCpG_ref.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.vcf.stats.txt", - sample=HIST_RESCALED_SUBSAMPLED_CpG_SAMPLES, - DP=config["subsampling_depth"], - fmiss=config["f_missing"], - chr=CHR,) - not_rescaled_subsampled_CpG = expand("results/gerp/historical/" + REF_NAME + "/vcf/stats/{sample}.merged.rmdup.merged.realn.mapped_q30.subs_dp{DP}.Q30.sorted.noCpG_ref.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.vcf.stats.txt", - sample=HIST_NOT_RESCALED_SUBSAMPLED_CpG_SAMPLES, - DP=config["subsampling_depth"], - fmiss=config["f_missing"], - chr=CHR,) - outlist += (rescaled_not_subsampled_CpG + not_rescaled_not_subsampled_CpG + rescaled_subsampled_CpG + not_rescaled_subsampled_CpG) - elif config["CpG_from_vcf_and_reference"] == True: - rescaled_not_subsampled_CpG = expand("results/gerp/historical/" + REF_NAME + 
"/vcf/stats/{sample}.merged.rmdup.merged.realn.rescaled.Q30.sorted.noCpG_vcfref.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.vcf.stats.txt", - sample=HIST_RESCALED_NOT_SUBSAMPLED_CpG_SAMPLES, - fmiss=config["f_missing"], - chr=CHR,) - not_rescaled_not_subsampled_CpG = expand("results/gerp/historical/" + REF_NAME + "/vcf/stats/{sample}.merged.rmdup.merged.realn.Q30.sorted.noCpG_vcfref.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.vcf.stats.txt", - sample=HIST_NOT_RESCALED_NOT_SUBSAMPLED_CpG_SAMPLES, - fmiss=config["f_missing"], - chr=CHR,) - rescaled_subsampled_CpG = expand("results/gerp/historical/" + REF_NAME + "/vcf/stats/{sample}.merged.rmdup.merged.realn.rescaled.mapped_q30.subs_dp{DP}.Q30.sorted.noCpG_vcfref.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.vcf.stats.txt", - sample=HIST_RESCALED_SUBSAMPLED_CpG_SAMPLES, - DP=config["subsampling_depth"], - fmiss=config["f_missing"], - chr=CHR,) - not_rescaled_subsampled_CpG = expand("results/gerp/historical/" + REF_NAME + "/vcf/stats/{sample}.merged.rmdup.merged.realn.mapped_q30.subs_dp{DP}.Q30.sorted.noCpG_vcfref.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.vcf.stats.txt", - sample=HIST_NOT_RESCALED_SUBSAMPLED_CpG_SAMPLES, - DP=config["subsampling_depth"], - fmiss=config["f_missing"], - chr=CHR,) - outlist += (rescaled_not_subsampled_CpG + not_rescaled_not_subsampled_CpG + rescaled_subsampled_CpG + not_rescaled_subsampled_CpG) - return outlist - -def modern_biallelic_missing_filtered_vcf_gerp_multiqc_inputs(wildcards): - """Input for modern_biallelic_missing_filtered_vcf_gerp_multiqc_inputs""" - not_subsampled_not_CpG = expand("results/gerp/modern/" + REF_NAME + "/vcf/stats/{sample}.merged.rmdup.merged.realn.Q30.sorted.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.vcf.stats.txt", - sample=MODERN_NOT_SUBSAMPLED_NOT_CpG_SAMPLES, - fmiss=config["f_missing"], - chr=CHR,) - subsampled_not_CpG = expand("results/gerp/modern/" 
+ REF_NAME + "/vcf/stats/{sample}.merged.rmdup.merged.realn.mapped_q30.subs_dp{DP}.Q30.sorted.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.vcf.stats.txt", - sample=MODERN_SUBSAMPLED_NOT_CpG_SAMPLES, - DP=config["subsampling_depth"], - fmiss=config["f_missing"], - chr=CHR,) - outlist = (not_subsampled_not_CpG + subsampled_not_CpG) - if config["CpG_from_vcf"] == True: - not_subsampled_CpG = expand("results/gerp/modern/" + REF_NAME + "/vcf/stats/{sample}.merged.rmdup.merged.realn.Q30.sorted.noCpG_vcf.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.vcf.stats.txt", - sample=MODERN_NOT_SUBSAMPLED_CpG_SAMPLES, - fmiss=config["f_missing"], - chr=CHR,) - subsampled_CpG = expand("results/gerp/modern/" + REF_NAME + "/vcf/stats/{sample}.merged.rmdup.merged.realn.mapped_q30.subs_dp{DP}.Q30.sorted.noCpG_vcf.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.vcf.stats.txt", - sample=MODERN_SUBSAMPLED_CpG_SAMPLES, - DP=config["subsampling_depth"], - fmiss=config["f_missing"], - chr=CHR,) - outlist += (not_subsampled_CpG + subsampled_CpG) - elif config["CpG_from_reference"] == True: - not_subsampled_CpG = expand("results/gerp/modern/" + REF_NAME + "/vcf/stats/{sample}.merged.rmdup.merged.realn.Q30.sorted.noCpG_ref.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.vcf.stats.txt", - sample=MODERN_NOT_SUBSAMPLED_CpG_SAMPLES, - fmiss=config["f_missing"], - chr=CHR,) - subsampled_CpG = expand("results/gerp/modern/" + REF_NAME + "/vcf/stats/{sample}.merged.rmdup.merged.realn.mapped_q30.subs_dp{DP}.Q30.sorted.noCpG_ref.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.vcf.stats.txt", - sample=MODERN_SUBSAMPLED_CpG_SAMPLES, - DP=config["subsampling_depth"], - fmiss=config["f_missing"], - chr=CHR,) - outlist += (not_subsampled_CpG + subsampled_CpG) - elif config["CpG_from_vcf_and_reference"] == True: - not_subsampled_CpG = expand("results/gerp/modern/" + REF_NAME + 
"/vcf/stats/{sample}.merged.rmdup.merged.realn.Q30.sorted.noCpG_vcfref.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.vcf.stats.txt", - sample=MODERN_NOT_SUBSAMPLED_CpG_SAMPLES, - fmiss=config["f_missing"], - chr=CHR,) - subsampled_CpG = expand("results/gerp/modern/" + REF_NAME + "/vcf/stats/{sample}.merged.rmdup.merged.realn.mapped_q30.subs_dp{DP}.Q30.sorted.noCpG_vcfref.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.vcf.stats.txt", - sample=MODERN_SUBSAMPLED_CpG_SAMPLES, - DP=config["subsampling_depth"], - fmiss=config["f_missing"], - chr=CHR,) - outlist += (not_subsampled_CpG + subsampled_CpG) - return outlist - def rel_load_table_inputs(wildcards): """Collect output files for pipeline report""" outlist = [] @@ -818,100 +688,10 @@ rule plot_gerp_hist: "../scripts/gerp_hist_plot.py" -rule repmasked_bcf2vcf_gerp: - """Convert bcf format to vcf.gz for removal of sites""" - input: - bcf="results/{dataset}/vcf/" + REF_NAME + "/{sample}.merged.rmdup.merged.{processed}.snps5.noIndel.QUAL30.dp.AB.repma.bcf", - index="results/{dataset}/vcf/" + REF_NAME + "/{sample}.merged.rmdup.merged.{processed}.snps5.noIndel.QUAL30.dp.AB.repma.bcf.csi", - output: - vcf=temp("results/gerp/chunks/" + REF_NAME + "/{dataset}/vcf/{sample}.merged.rmdup.merged.{processed}.snps5.noIndel.QUAL30.dp.AB.repma.vcf.gz"), - log: - "results/logs/13_GERP/chunks/" + REF_NAME + "/{dataset}/vcf/{sample}.{processed}_repmasked_bcf2vcf.log", - singularity: - "docker://quay.io/biocontainers/bcftools:1.9--h68d8f2e_9" - shell: - """ - bcftools convert -O z -o {output.vcf} {input.bcf} 2> {log} - """ - - -rule filter_biallelic_missing_vcf_gerp: - """Keep only sites with certain upper fraction missingness as specified in config file and sites that are biallelic across all samples from individual vcf files""" - input: - vcf=rules.repmasked_bcf2vcf_gerp.output.vcf, - bed=rules.filtered_vcf2bed.output.bed, - genomefile=rules.genome_file.output.genomefile, - output: - 
filtered=temp("results/gerp/{dataset}/" + REF_NAME + "/vcf/{sample}.merged.rmdup.merged.{processed}.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.vcf.gz"), - threads: 6 - log: - "results/logs/13_GERP/{dataset}/" + REF_NAME + "/vcf/{sample}.{processed}_fmissing{fmiss}.{chr}_filter_biallelic_missing_vcf.log", - singularity: - "docker://nbisweden/generode-bedtools-2.29.2" - shell: - """ - bedtools intersect -a {input.vcf} -b {input.bed} -header -sorted -g {input.genomefile} | bgzip -c > {output.filtered} 2> {log} - """ - - -rule biallelic_missing_filtered_vcf_gerp_stats: - """Obtain summary stats of filtered vcf file""" - input: - filtered="results/gerp/{dataset}/" + REF_NAME + "/vcf/{sample}.merged.rmdup.merged.{processed}.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.vcf.gz", - output: - stats="results/gerp/{dataset}/" + REF_NAME + "/vcf/stats/{sample}.merged.rmdup.merged.{processed}.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.vcf.stats.txt", - log: - "results/logs/13_GERP/{dataset}/" + REF_NAME + "/vcf/{sample}.{processed}_fmissing{fmiss}.{chr}_biallelic_missing_filtered_vcf_stats.log", - singularity: - "docker://quay.io/biocontainers/bcftools:1.9--h68d8f2e_9" - shell: - """ - bcftools stats {input.filtered} > {output.stats} 2> {log} - """ - - -rule historical_biallelic_missing_filtered_vcf_gerp_multiqc: - """Collect all stats files from historical vcf files filtered for biallelic sites and missing data""" - input: - historical_biallelic_missing_filtered_vcf_gerp_multiqc_inputs, - output: - stats="results/gerp/historical/" + REF_NAME + "/vcf/stats/multiqc/multiqc_report.html", - params: - indir="results/gerp/historical/" + REF_NAME + "/vcf/stats/", - outdir="results/gerp/historical/" + REF_NAME + "/vcf/stats/multiqc", - log: - "results/logs/13_GERP/historical/" + REF_NAME + "/vcf/biallelic_missing_filtered_vcf_multiqc.log", - singularity: - "docker://quay.io/biocontainers/multiqc:1.9--pyh9f0ad1d_0" - shell: 
- """ - multiqc -f {params.indir} -o {params.outdir} 2> {log} - """ - - -rule modern_biallelic_missing_filtered_vcf_gerp_multiqc: - """Collect all stats files from modern vcf files filtered for biallelic sites and missing data""" - input: - modern_biallelic_missing_filtered_vcf_gerp_multiqc_inputs, - output: - stats="results/gerp/modern/" + REF_NAME + "/vcf/stats/multiqc/multiqc_report.html", - params: - indir="results/gerp/modern/" + REF_NAME + "/vcf/stats/", - outdir="results/gerp/modern/" + REF_NAME + "/vcf/stats/multiqc", - log: - "results/logs/13_GERP/modern/" + REF_NAME + "/vcf/biallelic_missing_filtered_vcf_multiqc.log", - singularity: - "docker://quay.io/biocontainers/multiqc:1.9--pyh9f0ad1d_0" - shell: - """ - multiqc -f {params.indir} -o {params.outdir} 2> {log} - """ - - rule split_vcf_files: - """Split the VCF files into chunks for more resource-efficient merging with GERP results""" + """Split individual VCF files into chunks for more resource-efficient merging with GERP results""" input: - vcf="results/gerp/{dataset}/" + REF_NAME + "/vcf/{sample}.merged.rmdup.merged.{processed}.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.vcf.gz", + vcf=rules.filter_biallelic_missing_vcf.output.filtered, chunk_bed=REF_DIR + "/gerp/" + REF_NAME + "/split_bed_files/{chunk}.bed", genomefile=REF_DIR + "/" + REF_NAME + ".genome", output: diff --git a/workflow/rules/9_merge_vcfs.smk b/workflow/rules/9_merge_vcfs.smk index 2059bdb..7925c5c 100644 --- a/workflow/rules/9_merge_vcfs.smk +++ b/workflow/rules/9_merge_vcfs.smk @@ -166,6 +166,137 @@ def missingness_filtered_vcf_multiqc_inputs(wildcards): chr=CHR,) +def historical_biallelic_missing_filtered_vcf_multiqc_inputs(wildcards): + """Input for historical_biallelic_missing_filtered_vcf_multiqc_inputs""" + rescaled_not_subsampled_not_CpG = expand("results/historical/vcf/" + REF_NAME + 
"/stats/{sample}.merged.rmdup.merged.realn.rescaled.Q30.sorted.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.vcf.stats.txt", + sample=HIST_RESCALED_NOT_SUBSAMPLED_NOT_CpG_SAMPLES, + fmiss=config["f_missing"], + chr=CHR,) + not_rescaled_not_subsampled_not_CpG = expand("results/historical/vcf/" + REF_NAME + "/stats/{sample}.merged.rmdup.merged.realn.Q30.sorted.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.vcf.stats.txt", + sample=HIST_NOT_RESCALED_NOT_SUBSAMPLED_NOT_CpG_SAMPLES, + fmiss=config["f_missing"], + chr=CHR,) + rescaled_subsampled_not_CpG = expand("results/historical/vcf/" + REF_NAME + "/stats/{sample}.merged.rmdup.merged.realn.rescaled.mapped_q30.subs_dp{DP}.Q30.sorted.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.vcf.stats.txt", + sample=HIST_RESCALED_SUBSAMPLED_NOT_CpG_SAMPLES, + DP=config["subsampling_depth"], + fmiss=config["f_missing"], + chr=CHR,) + not_rescaled_subsampled_not_CpG = expand("results/historical/vcf/" + REF_NAME + "/stats/{sample}.merged.rmdup.merged.realn.mapped_q30.subs_dp{DP}.Q30.sorted.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.vcf.stats.txt", + sample=HIST_NOT_RESCALED_SUBSAMPLED_NOT_CpG_SAMPLES, + DP=config["subsampling_depth"], + fmiss=config["f_missing"], + chr=CHR,) + outlist = (rescaled_not_subsampled_not_CpG + not_rescaled_not_subsampled_not_CpG + rescaled_subsampled_not_CpG + not_rescaled_subsampled_not_CpG) + if config["CpG_from_vcf"] == True: + rescaled_not_subsampled_CpG = expand("results/historical/vcf/" + REF_NAME + "/stats/{sample}.merged.rmdup.merged.realn.rescaled.Q30.sorted.noCpG_vcf.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.vcf.stats.txt", + sample=HIST_RESCALED_NOT_SUBSAMPLED_CpG_SAMPLES, + fmiss=config["f_missing"], + chr=CHR,) + not_rescaled_not_subsampled_CpG = expand("results/historical/vcf/" + REF_NAME + 
"/stats/{sample}.merged.rmdup.merged.realn.Q30.sorted.noCpG_vcf.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.vcf.stats.txt", + sample=HIST_NOT_RESCALED_NOT_SUBSAMPLED_CpG_SAMPLES, + fmiss=config["f_missing"], + chr=CHR,) + rescaled_subsampled_CpG = expand("results/historical/vcf/" + REF_NAME + "/stats/{sample}.merged.rmdup.merged.realn.rescaled.mapped_q30.subs_dp{DP}.Q30.sorted.noCpG_vcf.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.vcf.stats.txt", + sample=HIST_RESCALED_SUBSAMPLED_CpG_SAMPLES, + DP=config["subsampling_depth"], + fmiss=config["f_missing"], + chr=CHR,) + not_rescaled_subsampled_CpG = expand("results/historical/vcf/" + REF_NAME + "/stats/{sample}.merged.rmdup.merged.realn.mapped_q30.subs_dp{DP}.Q30.sorted.noCpG_vcf.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.vcf.stats.txt", + sample=HIST_NOT_RESCALED_SUBSAMPLED_CpG_SAMPLES, + DP=config["subsampling_depth"], + fmiss=config["f_missing"], + chr=CHR,) + outlist += (rescaled_not_subsampled_CpG + not_rescaled_not_subsampled_CpG + rescaled_subsampled_CpG + not_rescaled_subsampled_CpG) + elif config["CpG_from_reference"] == True: + rescaled_not_subsampled_CpG = expand("results/historical/vcf/" + REF_NAME + "/stats/{sample}.merged.rmdup.merged.realn.rescaled.Q30.sorted.noCpG_ref.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.vcf.stats.txt", + sample=HIST_RESCALED_NOT_SUBSAMPLED_CpG_SAMPLES, + fmiss=config["f_missing"], + chr=CHR,) + not_rescaled_not_subsampled_CpG = expand("results/historical/vcf/" + REF_NAME + "/stats/{sample}.merged.rmdup.merged.realn.Q30.sorted.noCpG_ref.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.vcf.stats.txt", + sample=HIST_NOT_RESCALED_NOT_SUBSAMPLED_CpG_SAMPLES, + fmiss=config["f_missing"], + chr=CHR,) + rescaled_subsampled_CpG = expand("results/historical/vcf/" + REF_NAME + 
"/stats/{sample}.merged.rmdup.merged.realn.rescaled.mapped_q30.subs_dp{DP}.Q30.sorted.noCpG_ref.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.vcf.stats.txt", + sample=HIST_RESCALED_SUBSAMPLED_CpG_SAMPLES, + DP=config["subsampling_depth"], + fmiss=config["f_missing"], + chr=CHR,) + not_rescaled_subsampled_CpG = expand("results/historical/vcf/" + REF_NAME + "/stats/{sample}.merged.rmdup.merged.realn.mapped_q30.subs_dp{DP}.Q30.sorted.noCpG_ref.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.vcf.stats.txt", + sample=HIST_NOT_RESCALED_SUBSAMPLED_CpG_SAMPLES, + DP=config["subsampling_depth"], + fmiss=config["f_missing"], + chr=CHR,) + outlist += (rescaled_not_subsampled_CpG + not_rescaled_not_subsampled_CpG + rescaled_subsampled_CpG + not_rescaled_subsampled_CpG) + elif config["CpG_from_vcf_and_reference"] == True: + rescaled_not_subsampled_CpG = expand("results/historical/vcf/" + REF_NAME + "/stats/{sample}.merged.rmdup.merged.realn.rescaled.Q30.sorted.noCpG_vcfref.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.vcf.stats.txt", + sample=HIST_RESCALED_NOT_SUBSAMPLED_CpG_SAMPLES, + fmiss=config["f_missing"], + chr=CHR,) + not_rescaled_not_subsampled_CpG = expand("results/historical/vcf/" + REF_NAME + "/stats/{sample}.merged.rmdup.merged.realn.Q30.sorted.noCpG_vcfref.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.vcf.stats.txt", + sample=HIST_NOT_RESCALED_NOT_SUBSAMPLED_CpG_SAMPLES, + fmiss=config["f_missing"], + chr=CHR,) + rescaled_subsampled_CpG = expand("results/historical/vcf/" + REF_NAME + "/stats/{sample}.merged.rmdup.merged.realn.rescaled.mapped_q30.subs_dp{DP}.Q30.sorted.noCpG_vcfref.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.vcf.stats.txt", + sample=HIST_RESCALED_SUBSAMPLED_CpG_SAMPLES, + DP=config["subsampling_depth"], + fmiss=config["f_missing"], + chr=CHR,) + not_rescaled_subsampled_CpG = expand("results/historical/vcf/" + REF_NAME + 
"/stats/{sample}.merged.rmdup.merged.realn.mapped_q30.subs_dp{DP}.Q30.sorted.noCpG_vcfref.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.vcf.stats.txt", + sample=HIST_NOT_RESCALED_SUBSAMPLED_CpG_SAMPLES, + DP=config["subsampling_depth"], + fmiss=config["f_missing"], + chr=CHR,) + outlist += (rescaled_not_subsampled_CpG + not_rescaled_not_subsampled_CpG + rescaled_subsampled_CpG + not_rescaled_subsampled_CpG) + return outlist + +def modern_biallelic_missing_filtered_vcf_multiqc_inputs(wildcards): + """Input for modern_biallelic_missing_filtered_vcf_multiqc_inputs""" + not_subsampled_not_CpG = expand("results/modern/vcf/" + REF_NAME + "/stats/{sample}.merged.rmdup.merged.realn.Q30.sorted.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.vcf.stats.txt", + sample=MODERN_NOT_SUBSAMPLED_NOT_CpG_SAMPLES, + fmiss=config["f_missing"], + chr=CHR,) + subsampled_not_CpG = expand("results/modern/vcf/" + REF_NAME + "/stats/{sample}.merged.rmdup.merged.realn.mapped_q30.subs_dp{DP}.Q30.sorted.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.vcf.stats.txt", + sample=MODERN_SUBSAMPLED_NOT_CpG_SAMPLES, + DP=config["subsampling_depth"], + fmiss=config["f_missing"], + chr=CHR,) + outlist = (not_subsampled_not_CpG + subsampled_not_CpG) + if config["CpG_from_vcf"] == True: + not_subsampled_CpG = expand("results/modern/vcf/" + REF_NAME + "/stats/{sample}.merged.rmdup.merged.realn.Q30.sorted.noCpG_vcf.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.vcf.stats.txt", + sample=MODERN_NOT_SUBSAMPLED_CpG_SAMPLES, + fmiss=config["f_missing"], + chr=CHR,) + subsampled_CpG = expand("results/modern/vcf/" + REF_NAME + "/stats/{sample}.merged.rmdup.merged.realn.mapped_q30.subs_dp{DP}.Q30.sorted.noCpG_vcf.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.vcf.stats.txt", + sample=MODERN_SUBSAMPLED_CpG_SAMPLES, + DP=config["subsampling_depth"], + fmiss=config["f_missing"], + chr=CHR,) + outlist += (not_subsampled_CpG + 
subsampled_CpG) + elif config["CpG_from_reference"] == True: + not_subsampled_CpG = expand("results/modern/vcf/" + REF_NAME + "/stats/{sample}.merged.rmdup.merged.realn.Q30.sorted.noCpG_ref.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.vcf.stats.txt", + sample=MODERN_NOT_SUBSAMPLED_CpG_SAMPLES, + fmiss=config["f_missing"], + chr=CHR,) + subsampled_CpG = expand("results/modern/vcf/" + REF_NAME + "/stats/{sample}.merged.rmdup.merged.realn.mapped_q30.subs_dp{DP}.Q30.sorted.noCpG_ref.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.vcf.stats.txt", + sample=MODERN_SUBSAMPLED_CpG_SAMPLES, + DP=config["subsampling_depth"], + fmiss=config["f_missing"], + chr=CHR,) + outlist += (not_subsampled_CpG + subsampled_CpG) + elif config["CpG_from_vcf_and_reference"] == True: + not_subsampled_CpG = expand("results/modern/vcf/" + REF_NAME + "/stats/{sample}.merged.rmdup.merged.realn.Q30.sorted.noCpG_vcfref.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.vcf.stats.txt", + sample=MODERN_NOT_SUBSAMPLED_CpG_SAMPLES, + fmiss=config["f_missing"], + chr=CHR,) + subsampled_CpG = expand("results/modern/vcf/" + REF_NAME + "/stats/{sample}.merged.rmdup.merged.realn.mapped_q30.subs_dp{DP}.Q30.sorted.noCpG_vcfref.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.vcf.stats.txt", + sample=MODERN_SUBSAMPLED_CpG_SAMPLES, + DP=config["subsampling_depth"], + fmiss=config["f_missing"], + chr=CHR,) + outlist += (not_subsampled_CpG + subsampled_CpG) + return outlist + + # snakemake rules rule merge_all_vcfs: """Merge all samples into one VCF file, containing only SNPs""" @@ -483,6 +614,99 @@ rule missingness_filtered_vcf_multiqc: "results/logs/9_merge_vcfs/all/" + REF_NAME + "/missingness_filtered_vcf_multiqc.log", singularity: "docker://quay.io/biocontainers/multiqc:1.9--pyh9f0ad1d_0" + shell: + """ + multiqc -f {params.indir} -o {params.outdir} 2> {log} + """ + + +# Rules to filter individual BCF files for biallelic sites, missingness 
and sex-chromosomal scaffolds +# for snpEff and GERP steps. Only triggered when snpEff or GERP is run. + +rule repmasked_bcf2vcf: + """Convert bcf format to vcf.gz for removal of sites""" + input: + bcf="results/{dataset}/vcf/" + REF_NAME + "/{sample}.merged.rmdup.merged.{processed}.snps5.noIndel.QUAL30.dp.AB.repma.bcf", + index="results/{dataset}/vcf/" + REF_NAME + "/{sample}.merged.rmdup.merged.{processed}.snps5.noIndel.QUAL30.dp.AB.repma.bcf.csi", + output: + vcf=temp("results/{dataset}/vcf/" + REF_NAME + "/{sample}.merged.rmdup.merged.{processed}.snps5.noIndel.QUAL30.dp.AB.repma.tmp.vcf.gz"), + log: + "results/logs/9_merge_vcfs/{dataset}/" + REF_NAME + "/{sample}.{processed}_repmasked_bcf2vcf.log", + singularity: + "docker://quay.io/biocontainers/bcftools:1.9--h68d8f2e_9" + shell: + """ + bcftools convert -O z -o {output.vcf} {input.bcf} 2> {log} + """ + + +rule filter_biallelic_missing_vcf: + """Keep only sites with certain upper fraction missingness as specified in config file and sites that are biallelic across all samples from individual vcf files""" + input: + vcf=rules.repmasked_bcf2vcf.output.vcf, + bed=rules.filtered_vcf2bed.output.bed, + genomefile=rules.genome_file.output.genomefile, + output: + filtered="results/{dataset}/vcf/" + REF_NAME + "/{sample}.merged.rmdup.merged.{processed}.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.vcf.gz", + threads: 6 + log: + "results/logs/9_merge_vcfs/{dataset}/" + REF_NAME + "/{sample}.{processed}_fmissing{fmiss}.{chr}_filter_biallelic_missing_vcf.log", + singularity: + "docker://nbisweden/generode-bedtools-2.29.2" + shell: + """ + bedtools intersect -a {input.vcf} -b {input.bed} -header -sorted -g {input.genomefile} | bgzip -c > {output.filtered} 2> {log} + """ + + +rule biallelic_missing_filtered_vcf_stats: + """Obtain summary stats of filtered vcf file""" + input: + filtered="results/{dataset}/vcf/" + REF_NAME + 
"/{sample}.merged.rmdup.merged.{processed}.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.vcf.gz", + output: + stats="results/{dataset}/vcf/" + REF_NAME + "/stats/{sample}.merged.rmdup.merged.{processed}.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.vcf.stats.txt", + log: + "results/logs/9_merge_vcfs/{dataset}/" + REF_NAME + "/{sample}.{processed}_fmissing{fmiss}.{chr}_biallelic_missing_filtered_vcf_stats.log", + singularity: + "docker://quay.io/biocontainers/bcftools:1.9--h68d8f2e_9" + shell: + """ + bcftools stats {input.filtered} > {output.stats} 2> {log} + """ + + +rule historical_biallelic_missing_filtered_vcf_multiqc: + """Collect all stats files from historical vcf files filtered for biallelic sites and missing data""" + input: + historical_biallelic_missing_filtered_vcf_multiqc_inputs, + output: + stats="results/historical/vcf/" + REF_NAME + "/stats/multiqc/multiqc_report.html", + params: + indir="results/historical/vcf/" + REF_NAME + "/stats/", + outdir="results/historical/vcf/" + REF_NAME + "/stats/multiqc", + log: + "results/logs/9_merge_vcfs/historical/" + REF_NAME + "/biallelic_missing_filtered_vcf_multiqc.log", + singularity: + "docker://quay.io/biocontainers/multiqc:1.9--pyh9f0ad1d_0" + shell: + """ + multiqc -f {params.indir} -o {params.outdir} 2> {log} + """ + + +rule modern_biallelic_missing_filtered_vcf_multiqc: + """Collect all stats files from modern vcf files filtered for biallelic sites and missing data""" + input: + modern_biallelic_missing_filtered_vcf_multiqc_inputs, + output: + stats="results/modern/vcf/" + REF_NAME + "/stats/multiqc/multiqc_report.html", + params: + indir="results/modern/vcf/" + REF_NAME + "/stats/", + outdir="results/modern/vcf/" + REF_NAME + "/stats/multiqc", + log: + "results/logs/9_merge_vcfs/modern/" + REF_NAME + "/biallelic_missing_filtered_vcf_multiqc.log", + singularity: + "docker://quay.io/biocontainers/multiqc:1.9--pyh9f0ad1d_0" shell: """ multiqc -f {params.indir} -o 
{params.outdir} 2> {log} From 0307570d7e4a6f653acc7dd8ea7c6d922e4608af Mon Sep 17 00:00:00 2001 From: verku Date: Fri, 6 Oct 2023 12:29:41 +0200 Subject: [PATCH 22/49] Run GERP for autosomes only if sex-chromosomal scaffold list is provided or for the whole genome if not --- workflow/rules/13_GERP.smk | 87 +++++++++++++++++++------------------- workflow/rules/common.smk | 17 +++++--- 2 files changed, 55 insertions(+), 49 deletions(-) diff --git a/workflow/rules/13_GERP.smk b/workflow/rules/13_GERP.smk index b38b06a..42ae458 100644 --- a/workflow/rules/13_GERP.smk +++ b/workflow/rules/13_GERP.smk @@ -3,7 +3,8 @@ # Authors: Marcin Kierczak, Tom van der Valk, Verena Kutschera # Code collecting output files from this part of the pipeline -all_outputs.append("results/gerp/" + REF_NAME + ".ancestral.rates.gerp.hist.pdf") +all_outputs.append(expand("results/gerp/" + REF_NAME + ".{chr}.ancestral.rates.gerp.hist.pdf", + chr=CHR,)) if os.path.exists(config["historical_samples"]) and os.path.exists(config["modern_samples"]): all_outputs.append(expand("results/{dataset}/vcf/" + REF_NAME + "/stats/multiqc/multiqc_report.html", @@ -405,13 +406,13 @@ rule bam2fasta: bam=rules.align2target.output.bam, index=rules.index_gerp_bams.output.index, stats=rules.gerp_bam_multiqc.output.stats, - chunk_bed=REF_DIR + "/gerp/" + REF_NAME + "/split_bed_files/{chunk}.bed", + chunk_bed=REF_DIR + "/gerp/" + REF_NAME + "/split_bed_files_{chr}/{chunk}.bed", output: - fasta_dir=temp(directory("results/gerp/chunks/" + REF_NAME + "/fasta/{gerpref}_{chunk}/")), + fasta_dir=temp(directory("results/gerp/{chr}_chunks/" + REF_NAME + "/fasta/{gerpref}_{chunk}/")), params: gerpref="{gerpref}", log: - "results/logs/13_GERP/chunks/" + REF_NAME + "/fasta/{gerpref}_{chunk}_bam2fasta.log", + "results/logs/13_GERP/{chr}_chunks/" + REF_NAME + "/fasta/{gerpref}_{chunk}_bam2fasta.log", threads: 2 singularity: "docker://biocontainers/samtools:v1.9-4-deb_cv1" # This container includes python 3.7.6 with default python 
modules @@ -438,14 +439,14 @@ rule split_ref_contigs: input: ref=config["ref_path"], fai=config["ref_path"] + ".fai", - chunk_bed=REF_DIR + "/gerp/" + REF_NAME + "/split_bed_files/{chunk}.bed", + chunk_bed=REF_DIR + "/gerp/" + REF_NAME + "/split_bed_files_{chr}/{chunk}.bed", stats=rules.gerp_bam_multiqc.output.stats, output: - fasta_dir=temp(directory("results/gerp/chunks/" + REF_NAME + "/fasta/" + REF_NAME + "_{chunk}/")), + fasta_dir=temp(directory("results/gerp/{chr}_chunks/" + REF_NAME + "/fasta/" + REF_NAME + "_{chunk}/")), params: gerpref=REF_NAME, log: - "results/logs/13_GERP/chunks/" + REF_NAME + "/fasta/" + REF_NAME + "_{chunk}_split_ref_contigs.log", + "results/logs/13_GERP/{chr}_chunks/" + REF_NAME + "/fasta/" + REF_NAME + "_{chunk}_split_ref_contigs.log", threads: 1 singularity: "docker://quay.io/biocontainers/seqtk:1.3--hed695b0_2" @@ -471,16 +472,16 @@ rule concatenate_fasta_per_contig: This analysis is run as one job per genome chunk, but is internally run per contig. """ input: - gerpref_fasta=expand("results/gerp/chunks/" + REF_NAME + "/fasta/{gerpref}_{{chunk}}/", gerpref=GERP_REF_NAMES), + gerpref_fasta=expand("results/gerp/{chr}_chunks/" + REF_NAME + "/fasta/{gerpref}_{{chunk}}/", chr=CHR, gerpref=GERP_REF_NAMES), ref_fasta=rules.split_ref_contigs.output, - chunk_bed=REF_DIR + "/gerp/" + REF_NAME + "/split_bed_files/{chunk}.bed", + chunk_bed=REF_DIR + "/gerp/" + REF_NAME + "/split_bed_files_{chr}/{chunk}.bed", output: - concatenated_fasta_dir=temp(directory("results/gerp/chunks/" + REF_NAME + "/fasta/concatenated_{chunk}/")), + concatenated_fasta_dir=temp(directory("results/gerp/{chr}_chunks/" + REF_NAME + "/fasta/concatenated_{chunk}/")), params: - fasta_dir="results/gerp/chunks/" + REF_NAME + "/fasta", + fasta_dir="results/gerp/{chr}_chunks/" + REF_NAME + "/fasta", chunk=lambda wildcards: "{wildcards.chunk}", log: - "results/logs/13_GERP/chunks/" + REF_NAME + "/fasta/{chunk}_concatenate_fasta_per_contig.log", + 
"results/logs/13_GERP/{chr}_chunks/" + REF_NAME + "/fasta/{chunk}_concatenate_fasta_per_contig.log", threads: 2 run: if not os.path.exists(output.concatenated_fasta_dir): @@ -518,14 +519,14 @@ rule compute_gerp: """ input: concatenated_fasta_dir=rules.concatenate_fasta_per_contig.output, - chunk_bed=REF_DIR + "/gerp/" + REF_NAME + "/split_bed_files/{chunk}.bed", + chunk_bed=REF_DIR + "/gerp/" + REF_NAME + "/split_bed_files_{chr}/{chunk}.bed", tree=config["tree"], output: - gerp_dir=temp(directory("results/gerp/chunks/" + REF_NAME + "/gerp/{chunk}_gerp_raw/")), + gerp_dir=temp(directory("results/gerp/{chr}_chunks/" + REF_NAME + "/gerp/{chunk}_gerp_raw/")), params: name=REF_NAME, log: - "results/logs/13_GERP/chunks/" + REF_NAME + "/gerp/{chunk}_compute_gerp.log", + "results/logs/13_GERP/{chr}_chunks/" + REF_NAME + "/gerp/{chunk}_compute_gerp.log", threads: 4 singularity: "docker://quay.io/biocontainers/gerp:2.1--hfc679d8_0" @@ -552,13 +553,13 @@ rule gerp2coords: input: concatenated_fasta_dir=rules.concatenate_fasta_per_contig.output, gerp_dir=rules.compute_gerp.output, - chunk_bed=REF_DIR + "/gerp/" + REF_NAME + "/split_bed_files/{chunk}.bed", + chunk_bed=REF_DIR + "/gerp/" + REF_NAME + "/split_bed_files_{chr}/{chunk}.bed", output: - gerp_coords_dir=temp(directory("results/gerp/chunks/" + REF_NAME + "/gerp/{chunk}_gerp_coords/")), + gerp_coords_dir=temp(directory("results/gerp/{chr}_chunks/" + REF_NAME + "/gerp/{chunk}_gerp_coords/")), params: name=REF_NAME, log: - "results/logs/13_GERP/chunks/" + REF_NAME + "/gerp/{chunk}_gerp2coords.log", + "results/logs/13_GERP/{chr}_chunks/" + REF_NAME + "/gerp/{chunk}_gerp2coords.log", threads: 2 run: chunk_contigs = [] @@ -582,13 +583,13 @@ rule get_ancestral_state: """Get the ancestral state of each position in the focal reference genome.""" input: concatenated_fasta_dir=rules.concatenate_fasta_per_contig.output, - chunk_bed=REF_DIR + "/gerp/" + REF_NAME + "/split_bed_files/{chunk}.bed", + chunk_bed=REF_DIR + "/gerp/" + 
REF_NAME + "/split_bed_files_{chr}/{chunk}.bed", output: - fasta_ancestral_dir=temp(directory("results/gerp/chunks/" + REF_NAME + "/gerp/{chunk}_fasta_ancestral/")), + fasta_ancestral_dir=temp(directory("results/gerp/{chr}_chunks/" + REF_NAME + "/gerp/{chunk}_fasta_ancestral/")), params: name=REF_NAME, log: - "results/logs/13_GERP/chunks/" + REF_NAME + "/gerp/{chunk}_get_ancestral_state.log", + "results/logs/13_GERP/{chr}_chunks/" + REF_NAME + "/gerp/{chunk}_get_ancestral_state.log", threads: 2 run: chunk_contigs = [] @@ -612,11 +613,11 @@ rule produce_contig_out: input: fasta_ancestral_dir=rules.get_ancestral_state.output.fasta_ancestral_dir, gerp_coords_dir=rules.gerp2coords.output.gerp_coords_dir, - chunk_bed=REF_DIR + "/gerp/" + REF_NAME + "/split_bed_files/{chunk}.bed", + chunk_bed=REF_DIR + "/gerp/" + REF_NAME + "/split_bed_files_{chr}/{chunk}.bed", output: - gerp_merged_dir=temp(directory("results/gerp/chunks/" + REF_NAME + "/gerp/{chunk}_gerp_merged/")), + gerp_merged_dir=temp(directory("results/gerp/{chr}_chunks/" + REF_NAME + "/gerp/{chunk}_gerp_merged/")), log: - "results/logs/13_GERP/chunks/" + REF_NAME + "/gerp/{chunk}_produce_contig_out.log", + "results/logs/13_GERP/{chr}_chunks/" + REF_NAME + "/gerp/{chunk}_produce_contig_out.log", threads: 2 run: chunk_contigs = [] @@ -639,11 +640,11 @@ rule merge_gerp_per_chunk: """Merge results per genome chunk into one file.""" input: gerp_merged_dir=rules.produce_contig_out.output.gerp_merged_dir, - chunk_bed=REF_DIR + "/gerp/" + REF_NAME + "/split_bed_files/{chunk}.bed", + chunk_bed=REF_DIR + "/gerp/" + REF_NAME + "/split_bed_files_{chr}/{chunk}.bed", output: - gerp_chunks_merged=temp("results/gerp/chunks/" + REF_NAME + "/gerp/{chunk}.fasta.parsed.rates"), + gerp_chunks_merged=temp("results/gerp/{chr}_chunks/" + REF_NAME + "/gerp/{chunk}.fasta.parsed.rates"), log: - "results/logs/13_GERP/chunks/" + REF_NAME + "/gerp/{chunk}_merge_per_chunk.log", + "results/logs/13_GERP/{chr}_chunks/" + REF_NAME + 
"/gerp/{chunk}_merge_per_chunk.log", threads: 2 run: chunk_contigs = [] @@ -661,11 +662,11 @@ rule merge_gerp_per_chunk: rule merge_gerp_gz: """Merge results per contig into one file.""" input: - gerp_chunks_merged=expand("results/gerp/chunks/" + REF_NAME + "/gerp/{chunk}.fasta.parsed.rates", chunk=CHUNKS), + gerp_chunks_merged=expand("results/gerp/{chr}_chunks/" + REF_NAME + "/gerp/{chunk}.fasta.parsed.rates", chr=CHR, chunk=CHUNKS), output: - gerp_out="results/gerp/" + REF_NAME + ".ancestral.rates.gz", + gerp_out="results/gerp/" + REF_NAME + ".{chr}.ancestral.rates.gz", log: - "results/logs/13_GERP/" + REF_NAME + "_merge_gerp_gz.log", + "results/logs/13_GERP/" + REF_NAME + ".{chr}.merge_gerp_gz.log", threads: 2 shell: """ @@ -678,12 +679,12 @@ rule plot_gerp_hist: input: gerp_out=rules.merge_gerp_gz.output.gerp_out, output: - pdf=report("results/gerp/" + REF_NAME + ".ancestral.rates.gerp.hist.pdf", + pdf=report("results/gerp/" + REF_NAME + ".{chr}.ancestral.rates.gerp.hist.pdf", caption="../report/gerp_plot.rst", category="GERP",), threads: 2 log: - "results/logs/13_GERP/" + REF_NAME + "_plot_gerp_hist.log", + "results/logs/13_GERP/" + REF_NAME + ".{chr}.plot_gerp_hist.log", script: "../scripts/gerp_hist_plot.py" @@ -692,12 +693,12 @@ rule split_vcf_files: """Split individual VCF files into chunks for more resource-efficient merging with GERP results""" input: vcf=rules.filter_biallelic_missing_vcf.output.filtered, - chunk_bed=REF_DIR + "/gerp/" + REF_NAME + "/split_bed_files/{chunk}.bed", + chunk_bed=REF_DIR + "/gerp/" + REF_NAME + "/split_bed_files_{chr}/{chunk}.bed", genomefile=REF_DIR + "/" + REF_NAME + ".genome", output: - vcf_chunk=temp("results/gerp/chunks/" + REF_NAME + "/{dataset}/vcf/{sample}.merged.rmdup.merged.{processed}.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.{chunk}.vcf.gz"), + vcf_chunk=temp("results/gerp/{chr}_chunks/" + REF_NAME + 
"/{dataset}/vcf/{sample}.merged.rmdup.merged.{processed}.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.{chunk}.vcf.gz"), log: - "results/logs/13_GERP/chunks/" + REF_NAME + "/{dataset}/vcf/{sample}.{processed}_fmissing{fmiss}.{chr}.{chunk}_split_vcf_chunks.log", + "results/logs/13_GERP/{chr}_chunks/" + REF_NAME + "/{dataset}/vcf/{sample}.{processed}_fmissing{fmiss}.{chr}.{chunk}_split_vcf_chunks.log", singularity: "docker://nbisweden/generode-bedtools-2.29.2" shell: @@ -709,11 +710,11 @@ rule split_vcf_files: rule split_chunk_bed_files: """Split the chunk bed files into 10 million basepair windows""" input: - chunk_bed=REF_DIR + "/gerp/" + REF_NAME + "/split_bed_files/{chunk}.bed", + chunk_bed=REF_DIR + "/gerp/" + REF_NAME + "/split_bed_files_{chr}/{chunk}.bed", output: - chunk_win_bed=temp(REF_DIR + "/gerp/" + REF_NAME + "/split_bed_files/windows/{chunk}_10Mwindows.bed"), + chunk_win_bed=temp(REF_DIR + "/gerp/" + REF_NAME + "/split_bed_files_{chr}/windows/{chunk}_10Mwindows.bed"), log: - "results/logs/13_GERP/" + REF_NAME + ".{chunk}_split_chunk_bed_files.log", + "results/logs/13_GERP/" + REF_NAME + ".{chunk}_{chr}_split_chunk_bed_files.log", singularity: "docker://nbisweden/generode-bedtools-2.29.2" shell: @@ -732,9 +733,9 @@ rule gerp_derived_alleles: vcf=rules.split_vcf_files.output.vcf_chunk, chunk_win_bed=rules.split_chunk_bed_files.output.chunk_win_bed, output: - gerp_alleles_dir=temp(directory("results/gerp/chunks/" + REF_NAME + "/{dataset}/{sample}.merged.rmdup.merged.{processed}.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.{chunk}_gerp_derived_alleles/")), + gerp_alleles_dir=temp(directory("results/gerp/{chr}_chunks/" + REF_NAME + "/{dataset}/{sample}.merged.rmdup.merged.{processed}.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.{chunk}_gerp_derived_alleles/")), log: - "results/logs/13_GERP/chunks/" + REF_NAME + 
"/{dataset}/{sample}.merged.rmdup.merged.{processed}.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.{chunk}_gerp_derived_alleles.log", + "results/logs/13_GERP/{chr}_chunks/" + REF_NAME + "/{dataset}/{sample}.merged.rmdup.merged.{processed}.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.{chunk}_gerp_derived_alleles.log", threads: 4 shell: """ @@ -754,9 +755,9 @@ rule merge_gerp_alleles_per_chunk: gerp_alleles_dir=rules.gerp_derived_alleles.output.gerp_alleles_dir, chunk_win_bed=rules.split_chunk_bed_files.output.chunk_win_bed, output: - gerp_chunks_merged=temp("results/gerp/chunks/" + REF_NAME + "/{dataset}/{sample}.merged.rmdup.merged.{processed}.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.{chunk}.fasta.parsed.rates.derived_alleles"), + gerp_chunks_merged=temp("results/gerp/{chr}_chunks/" + REF_NAME + "/{dataset}/{sample}.merged.rmdup.merged.{processed}.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.{chunk}.fasta.parsed.rates.derived_alleles"), log: - "results/logs/13_GERP/chunks/" + REF_NAME + "/{dataset}/{sample}.merged.rmdup.merged.{processed}.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.{chunk}_merge_gerp_alleles_per_chunk.log", + "results/logs/13_GERP/{chr}_chunks/" + REF_NAME + "/{dataset}/{sample}.merged.rmdup.merged.{processed}.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.{chunk}_merge_gerp_alleles_per_chunk.log", threads: 4 run: chunk_windows = [] @@ -779,7 +780,7 @@ rule merge_gerp_alleles_per_chunk: rule merge_gerp_alleles_gz: """Merge results into one file per sample.""" input: - gerp_chunks_merged=expand("results/gerp/chunks/" + REF_NAME + "/{{dataset}}/{{sample}}.merged.rmdup.merged.{{processed}}.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.{chunk}.fasta.parsed.rates.derived_alleles", + gerp_chunks_merged=expand("results/gerp/{chr}_chunks/" + REF_NAME + 
"/{{dataset}}/{{sample}}.merged.rmdup.merged.{{processed}}.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.{chunk}.fasta.parsed.rates.derived_alleles", chunk=CHUNKS, fmiss=config["f_missing"], chr=CHR, diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index be6c9fa..6a9214f 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -397,8 +397,9 @@ def create_refbedfile(reference_fasta, bedfile): if seq_length > 0: bedfile_out.write(contig + "\t0\t" + str(seq_length) + "\n") -def split_ref_bed(refbedfile, outdir): +def split_ref_bed(refbedfile, outdir, chromosomelist): bed_df = pd.read_csv(refbedfile, sep="\t", header=None) + bed_df = bed_df[~bed_df[0].isin(chromosomelist)] # remove any scaffolds/contigs in the list of sex-chromosomal scaffolds/contigs, if provided if len(bed_df) >= 200: lines = len(bed_df) // 200 elif len(bed_df) < 200: @@ -427,7 +428,10 @@ if config["gerp"]: # create chunk bed files and chunk list ref_bed = REF_DIR + "/" + REF_NAME + ".bed" - chunk_bed_outdir = REF_DIR + "/gerp/" + REF_NAME + "/split_bed_files/" + if len(sexchromosomeList) > 0: + chunk_bed_outdir = REF_DIR + "/gerp/" + REF_NAME + "/split_bed_files_autos/" + elif len(sexchromosomeList) == 0: + chunk_bed_outdir = REF_DIR + "/gerp/" + REF_NAME + "/split_bed_files_genome/" # create output directory for chunk bed files, if not present yet if not os.path.exists(chunk_bed_outdir): @@ -440,10 +444,11 @@ if config["gerp"]: print("Created reference genome bed file: ", config["ref_path"], ref_bed) # split the reference bed file into chunks and store a list of the chunk names in a list - CHUNK_BED_FILES = [file for file in os.listdir(chunk_bed_outdir) if file.endswith(".bed")] # create a list of the chunk bed files present in the directory - if len(CHUNK_BED_FILES) == 0: # check if splitting needs to be run - split_ref_bed(ref_bed, chunk_bed_outdir) - CHUNK_BED_FILES = [file for file in os.listdir(chunk_bed_outdir) if file.endswith(".bed")] # 
replace list of the chunk bed files present in the directory after (re-)running the splitting + split_ref_bed(ref_bed, chunk_bed_outdir, sexchromosomeList) + CHUNK_BED_FILES = [file for file in os.listdir(chunk_bed_outdir) if file.endswith(".bed")] # list of the chunk bed files present in the directory after running the splitting + if len(sexchromosomeList) > 0: + print("Split the reference genome bed file into chunks, excluding sex-chromosomal scaffolds/contigs") + elif len(sexchromosomeList) == 0: print("Split the reference genome bed file into chunks") CHUNKS = [bed.replace(".bed", "") for bed in CHUNK_BED_FILES] From f01ceca62769745897c737b33ba1d6aab5be37c3 Mon Sep 17 00:00:00 2001 From: verku Date: Fri, 6 Oct 2023 13:55:07 +0200 Subject: [PATCH 23/49] Update gitignore --- .gitignore | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index d53faab..4365518 100644 --- a/.gitignore +++ b/.gitignore @@ -22,4 +22,5 @@ tmpConsensi.fa .test/data/references/*pac .test/data/references/*sa .test/data/references/*genome -.test/data/references/*upper.fasta \ No newline at end of file +.test/data/references/*upper.fasta +.test/data/references/gerp \ No newline at end of file From b3d6b33c68d75180087ccd71c36c968c4a50ad6d Mon Sep 17 00:00:00 2001 From: verku Date: Mon, 9 Oct 2023 09:39:14 +0200 Subject: [PATCH 24/49] Update to match pipeline config file and relax missingness filter --- .test/config/config_mitogenomes.yaml | 10 ++++------ .test/config/config_mlRho_options.yaml | 10 ++++------ .test/config/config_pca_roh.yaml | 10 ++++------ .test/config/config_snpeff_gerp.yaml | 10 ++++------ 4 files changed, 16 insertions(+), 24 deletions(-) diff --git a/.test/config/config_mitogenomes.yaml b/.test/config/config_mitogenomes.yaml index be5989b..f90e9c8 100644 --- a/.test/config/config_mitogenomes.yaml +++ b/.test/config/config_mitogenomes.yaml @@ -371,8 +371,10 @@ merge_vcfs_per_dataset: False # Maximum allowed fraction of missing 
genotypes across all samples for a # site to be kept in the BCF and BED file, to ensure that the same sites -# are compared between historical and modern samples. -f_missing: 0.1 # default: 0.1 (i.e. maximum 10% missing genotypes per site) +# are compared between historical and modern samples. Has to be a floating +# point number between 0.0 (no missing data allowed) and 1.0 (sites are +# allowed that are completely missing). +f_missing: 0.9 # default: 0.1 (i.e. maximum 10% missing genotypes per site) ##### ################################################################# @@ -500,10 +502,6 @@ gerp_ref_path: "" # without ".fa.gz", ".fasta.gz" or ".fna.gz". tree: "" -# Tree scaling factor for GERP++ ("-s") to re-scale tree. If a tree from -# www.timetree.org is provided (in millions of years), set to 0.001. -tree_scaling_factor: 0.001 - # Minimum and maximum GERP score for a site to be included into calculations # of relative mutational load. # Positive values indicate purifying selection. diff --git a/.test/config/config_mlRho_options.yaml b/.test/config/config_mlRho_options.yaml index 774f94b..88ae094 100644 --- a/.test/config/config_mlRho_options.yaml +++ b/.test/config/config_mlRho_options.yaml @@ -371,8 +371,10 @@ merge_vcfs_per_dataset: False # Maximum allowed fraction of missing genotypes across all samples for a # site to be kept in the BCF and BED file, to ensure that the same sites -# are compared between historical and modern samples. -f_missing: 0.1 # default: 0.1 (i.e. maximum 10% missing genotypes per site) +# are compared between historical and modern samples. Has to be a floating +# point number between 0.0 (no missing data allowed) and 1.0 (sites are +# allowed that are completely missing). +f_missing: 0.9 # default: 0.1 (i.e. maximum 10% missing genotypes per site) ##### ################################################################# @@ -500,10 +502,6 @@ gerp_ref_path: "" # without ".fa.gz", ".fasta.gz" or ".fna.gz". 
tree: "" -# Tree scaling factor for GERP++ ("-s") to re-scale tree. If a tree from -# www.timetree.org is provided (in millions of years), set to 0.001. -tree_scaling_factor: 0.001 - # Minimum and maximum GERP score for a site to be included into calculations # of relative mutational load. # Positive values indicate purifying selection. diff --git a/.test/config/config_pca_roh.yaml b/.test/config/config_pca_roh.yaml index 08f6c6e..401f73c 100644 --- a/.test/config/config_pca_roh.yaml +++ b/.test/config/config_pca_roh.yaml @@ -371,8 +371,10 @@ merge_vcfs_per_dataset: False # Maximum allowed fraction of missing genotypes across all samples for a # site to be kept in the BCF and BED file, to ensure that the same sites -# are compared between historical and modern samples. -f_missing: 0.5 # default: 0.1 (i.e. maximum 10% missing genotypes per site) +# are compared between historical and modern samples. Has to be a floating +# point number between 0.0 (no missing data allowed) and 1.0 (sites are +# allowed that are completely missing). +f_missing: 0.9 # default: 0.1 (i.e. maximum 10% missing genotypes per site) ##### ################################################################# @@ -500,10 +502,6 @@ gerp_ref_path: "" # without ".fa.gz", ".fasta.gz" or ".fna.gz". tree: "" -# Tree scaling factor for GERP++ ("-s") to re-scale tree. If a tree from -# www.timetree.org is provided (in millions of years), set to 0.001. -tree_scaling_factor: 0.001 - # Minimum and maximum GERP score for a site to be included into calculations # of relative mutational load. # Positive values indicate purifying selection. 
diff --git a/.test/config/config_snpeff_gerp.yaml b/.test/config/config_snpeff_gerp.yaml index a8db6e3..3bda1b3 100644 --- a/.test/config/config_snpeff_gerp.yaml +++ b/.test/config/config_snpeff_gerp.yaml @@ -371,8 +371,10 @@ merge_vcfs_per_dataset: False # Maximum allowed fraction of missing genotypes across all samples for a # site to be kept in the BCF and BED file, to ensure that the same sites -# are compared between historical and modern samples. -f_missing: 0.1 # default: 0.1 (i.e. maximum 10% missing genotypes per site) +# are compared between historical and modern samples. Has to be a floating +# point number between 0.0 (no missing data allowed) and 1.0 (sites are +# allowed that are completely missing). +f_missing: 0.9 # default: 0.1 (i.e. maximum 10% missing genotypes per site) ##### ################################################################# @@ -506,10 +508,6 @@ tree: ".test/data/gerp_data/gerp_tree.nwk" min_gerp: 0 max_gerp: 1000 -# Tree scaling factor for GERP++ ("-s") to re-scale tree. If a tree from -# www.timetree.org is provided (in millions of years), set to 0.001. 
-tree_scaling_factor: 0.001 - ##### # NOTE: # The GERP step produces a large number of large intermediate files, From 8d9c521255156783e02757e86239984e814afea8 Mon Sep 17 00:00:00 2001 From: verku Date: Mon, 9 Oct 2023 09:40:04 +0200 Subject: [PATCH 25/49] Update pipeline version --- .test/config/config_mitogenomes.yaml | 2 +- .test/config/config_mlRho_options.yaml | 2 +- .test/config/config_pca_roh.yaml | 2 +- .test/config/config_snpeff_gerp.yaml | 2 +- Snakefile | 2 +- config/config.yaml | 2 +- workflow/rules/common.smk | 2 +- 7 files changed, 7 insertions(+), 7 deletions(-) diff --git a/.test/config/config_mitogenomes.yaml b/.test/config/config_mitogenomes.yaml index f90e9c8..2256053 100644 --- a/.test/config/config_mitogenomes.yaml +++ b/.test/config/config_mitogenomes.yaml @@ -1,6 +1,6 @@ ################################################################# ################################################################# -# Configuration settings for the GenErode pipeline 0.5.1 # +# Configuration settings for the GenErode pipeline 0.6.0 # # for ancient or historical samples, and modern samples # ################################################################# ################################################################# diff --git a/.test/config/config_mlRho_options.yaml b/.test/config/config_mlRho_options.yaml index 88ae094..3857fef 100644 --- a/.test/config/config_mlRho_options.yaml +++ b/.test/config/config_mlRho_options.yaml @@ -1,6 +1,6 @@ ################################################################# ################################################################# -# Configuration settings for the GenErode pipeline 0.5.1 # +# Configuration settings for the GenErode pipeline 0.6.0 # # for ancient or historical samples, and modern samples # ################################################################# ################################################################# diff --git a/.test/config/config_pca_roh.yaml b/.test/config/config_pca_roh.yaml 
index 401f73c..f366271 100644 --- a/.test/config/config_pca_roh.yaml +++ b/.test/config/config_pca_roh.yaml @@ -1,6 +1,6 @@ ################################################################# ################################################################# -# Configuration settings for the GenErode pipeline 0.5.1 # +# Configuration settings for the GenErode pipeline 0.6.0 # # for ancient or historical samples, and modern samples # ################################################################# ################################################################# diff --git a/.test/config/config_snpeff_gerp.yaml b/.test/config/config_snpeff_gerp.yaml index 3bda1b3..4b0f2f5 100644 --- a/.test/config/config_snpeff_gerp.yaml +++ b/.test/config/config_snpeff_gerp.yaml @@ -1,6 +1,6 @@ ################################################################# ################################################################# -# Configuration settings for the GenErode pipeline 0.5.1 # +# Configuration settings for the GenErode pipeline 0.6.0 # # for ancient or historical samples, and modern samples # ################################################################# ################################################################# diff --git a/Snakefile b/Snakefile index 1b3c4fc..4e6a75f 100644 --- a/Snakefile +++ b/Snakefile @@ -2,7 +2,7 @@ # This is the Snakefile of the GenErode pipeline for historical or # # ancient and modern samples to study patterns of genome erosion # # # -# Pipeline version 0.5.1 # +# Pipeline version 0.6.0 # # # # Written by Verena Kutschera, Marcin Kierczak and Tom van der Valk # # Email: generode@nbis.se # diff --git a/config/config.yaml b/config/config.yaml index fcbc17b..aa21d74 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -1,6 +1,6 @@ ################################################################# ################################################################# -# Configuration settings for the GenErode pipeline 0.5.1 # +# Configuration 
settings for the GenErode pipeline 0.6.0 # # for ancient or historical samples, and modern samples # ################################################################# ################################################################# diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index 576cf4c..071daef 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -10,7 +10,7 @@ import pandas as pd min_version("5.19.0") -generode_version = "0.5.1" +generode_version = "0.6.0" configfile: "config/config.yaml" From bcb8dd10f6ee4a149922590fcb3a4bc05f17d114 Mon Sep 17 00:00:00 2001 From: verku Date: Mon, 9 Oct 2023 10:02:21 +0200 Subject: [PATCH 26/49] Move stats output file to own directory for multiqc to only run on the specified filtering level --- workflow/rules/9_merge_vcfs.smk | 68 ++++++++++++++++----------------- 1 file changed, 34 insertions(+), 34 deletions(-) diff --git a/workflow/rules/9_merge_vcfs.smk b/workflow/rules/9_merge_vcfs.smk index 7925c5c..4915b7c 100644 --- a/workflow/rules/9_merge_vcfs.smk +++ b/workflow/rules/9_merge_vcfs.smk @@ -168,80 +168,80 @@ def missingness_filtered_vcf_multiqc_inputs(wildcards): def historical_biallelic_missing_filtered_vcf_multiqc_inputs(wildcards): """Input for historical_biallelic_missing_filtered_vcf_multiqc_inputs""" - rescaled_not_subsampled_not_CpG = expand("results/historical/vcf/" + REF_NAME + "/stats/{sample}.merged.rmdup.merged.realn.rescaled.Q30.sorted.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.vcf.stats.txt", + rescaled_not_subsampled_not_CpG = expand("results/historical/vcf/" + REF_NAME + "/stats/vcf_biallelic_missing_{chr}/{sample}.merged.rmdup.merged.realn.rescaled.Q30.sorted.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.vcf.stats.txt", sample=HIST_RESCALED_NOT_SUBSAMPLED_NOT_CpG_SAMPLES, fmiss=config["f_missing"], chr=CHR,) - not_rescaled_not_subsampled_not_CpG = expand("results/historical/vcf/" + REF_NAME + 
"/stats/{sample}.merged.rmdup.merged.realn.Q30.sorted.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.vcf.stats.txt", + not_rescaled_not_subsampled_not_CpG = expand("results/historical/vcf/" + REF_NAME + "/stats/vcf_biallelic_missing_{chr}/{sample}.merged.rmdup.merged.realn.Q30.sorted.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.vcf.stats.txt", sample=HIST_NOT_RESCALED_NOT_SUBSAMPLED_NOT_CpG_SAMPLES, fmiss=config["f_missing"], chr=CHR,) - rescaled_subsampled_not_CpG = expand("results/historical/vcf/" + REF_NAME + "/stats/{sample}.merged.rmdup.merged.realn.rescaled.mapped_q30.subs_dp{DP}.Q30.sorted.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.vcf.stats.txt", + rescaled_subsampled_not_CpG = expand("results/historical/vcf/" + REF_NAME + "/stats/vcf_biallelic_missing_{chr}/{sample}.merged.rmdup.merged.realn.rescaled.mapped_q30.subs_dp{DP}.Q30.sorted.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.vcf.stats.txt", sample=HIST_RESCALED_SUBSAMPLED_NOT_CpG_SAMPLES, DP=config["subsampling_depth"], fmiss=config["f_missing"], chr=CHR,) - not_rescaled_subsampled_not_CpG = expand("results/historical/vcf/" + REF_NAME + "/stats/{sample}.merged.rmdup.merged.realn.mapped_q30.subs_dp{DP}.Q30.sorted.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.vcf.stats.txt", + not_rescaled_subsampled_not_CpG = expand("results/historical/vcf/" + REF_NAME + "/stats/vcf_biallelic_missing_{chr}/{sample}.merged.rmdup.merged.realn.mapped_q30.subs_dp{DP}.Q30.sorted.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.vcf.stats.txt", sample=HIST_NOT_RESCALED_SUBSAMPLED_NOT_CpG_SAMPLES, DP=config["subsampling_depth"], fmiss=config["f_missing"], chr=CHR,) outlist = (rescaled_not_subsampled_not_CpG + not_rescaled_not_subsampled_not_CpG + rescaled_subsampled_not_CpG + not_rescaled_subsampled_not_CpG) if config["CpG_from_vcf"] == True: - rescaled_not_subsampled_CpG = expand("results/historical/vcf/" + 
REF_NAME + "/stats/{sample}.merged.rmdup.merged.realn.rescaled.Q30.sorted.noCpG_vcf.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.vcf.stats.txt", + rescaled_not_subsampled_CpG = expand("results/historical/vcf/" + REF_NAME + "/stats/vcf_biallelic_missing_{chr}/{sample}.merged.rmdup.merged.realn.rescaled.Q30.sorted.noCpG_vcf.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.vcf.stats.txt", sample=HIST_RESCALED_NOT_SUBSAMPLED_CpG_SAMPLES, fmiss=config["f_missing"], chr=CHR,) - not_rescaled_not_subsampled_CpG = expand("results/historical/vcf/" + REF_NAME + "/stats/{sample}.merged.rmdup.merged.realn.Q30.sorted.noCpG_vcf.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.vcf.stats.txt", + not_rescaled_not_subsampled_CpG = expand("results/historical/vcf/" + REF_NAME + "/stats/vcf_biallelic_missing_{chr}/{sample}.merged.rmdup.merged.realn.Q30.sorted.noCpG_vcf.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.vcf.stats.txt", sample=HIST_NOT_RESCALED_NOT_SUBSAMPLED_CpG_SAMPLES, fmiss=config["f_missing"], chr=CHR,) - rescaled_subsampled_CpG = expand("results/historical/vcf/" + REF_NAME + "/stats/{sample}.merged.rmdup.merged.realn.rescaled.mapped_q30.subs_dp{DP}.Q30.sorted.noCpG_vcf.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.vcf.stats.txt", + rescaled_subsampled_CpG = expand("results/historical/vcf/" + REF_NAME + "/stats/vcf_biallelic_missing_{chr}/{sample}.merged.rmdup.merged.realn.rescaled.mapped_q30.subs_dp{DP}.Q30.sorted.noCpG_vcf.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.vcf.stats.txt", sample=HIST_RESCALED_SUBSAMPLED_CpG_SAMPLES, DP=config["subsampling_depth"], fmiss=config["f_missing"], chr=CHR,) - not_rescaled_subsampled_CpG = expand("results/historical/vcf/" + REF_NAME + "/stats/{sample}.merged.rmdup.merged.realn.mapped_q30.subs_dp{DP}.Q30.sorted.noCpG_vcf.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.vcf.stats.txt", + 
not_rescaled_subsampled_CpG = expand("results/historical/vcf/" + REF_NAME + "/stats/vcf_biallelic_missing_{chr}/{sample}.merged.rmdup.merged.realn.mapped_q30.subs_dp{DP}.Q30.sorted.noCpG_vcf.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.vcf.stats.txt", sample=HIST_NOT_RESCALED_SUBSAMPLED_CpG_SAMPLES, DP=config["subsampling_depth"], fmiss=config["f_missing"], chr=CHR,) outlist += (rescaled_not_subsampled_CpG + not_rescaled_not_subsampled_CpG + rescaled_subsampled_CpG + not_rescaled_subsampled_CpG) elif config["CpG_from_reference"] == True: - rescaled_not_subsampled_CpG = expand("results/historical/vcf/" + REF_NAME + "/stats/{sample}.merged.rmdup.merged.realn.rescaled.Q30.sorted.noCpG_ref.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.vcf.stats.txt", + rescaled_not_subsampled_CpG = expand("results/historical/vcf/" + REF_NAME + "/stats/vcf_biallelic_missing_{chr}/{sample}.merged.rmdup.merged.realn.rescaled.Q30.sorted.noCpG_ref.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.vcf.stats.txt", sample=HIST_RESCALED_NOT_SUBSAMPLED_CpG_SAMPLES, fmiss=config["f_missing"], chr=CHR,) - not_rescaled_not_subsampled_CpG = expand("results/historical/vcf/" + REF_NAME + "/stats/{sample}.merged.rmdup.merged.realn.Q30.sorted.noCpG_ref.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.vcf.stats.txt", + not_rescaled_not_subsampled_CpG = expand("results/historical/vcf/" + REF_NAME + "/stats/vcf_biallelic_missing_{chr}/{sample}.merged.rmdup.merged.realn.Q30.sorted.noCpG_ref.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.vcf.stats.txt", sample=HIST_NOT_RESCALED_NOT_SUBSAMPLED_CpG_SAMPLES, fmiss=config["f_missing"], chr=CHR,) - rescaled_subsampled_CpG = expand("results/historical/vcf/" + REF_NAME + "/stats/{sample}.merged.rmdup.merged.realn.rescaled.mapped_q30.subs_dp{DP}.Q30.sorted.noCpG_ref.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.vcf.stats.txt", + rescaled_subsampled_CpG = 
expand("results/historical/vcf/" + REF_NAME + "/stats/vcf_biallelic_missing_{chr}/{sample}.merged.rmdup.merged.realn.rescaled.mapped_q30.subs_dp{DP}.Q30.sorted.noCpG_ref.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.vcf.stats.txt", sample=HIST_RESCALED_SUBSAMPLED_CpG_SAMPLES, DP=config["subsampling_depth"], fmiss=config["f_missing"], chr=CHR,) - not_rescaled_subsampled_CpG = expand("results/historical/vcf/" + REF_NAME + "/stats/{sample}.merged.rmdup.merged.realn.mapped_q30.subs_dp{DP}.Q30.sorted.noCpG_ref.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.vcf.stats.txt", + not_rescaled_subsampled_CpG = expand("results/historical/vcf/" + REF_NAME + "/stats/vcf_biallelic_missing_{chr}/{sample}.merged.rmdup.merged.realn.mapped_q30.subs_dp{DP}.Q30.sorted.noCpG_ref.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.vcf.stats.txt", sample=HIST_NOT_RESCALED_SUBSAMPLED_CpG_SAMPLES, DP=config["subsampling_depth"], fmiss=config["f_missing"], chr=CHR,) outlist += (rescaled_not_subsampled_CpG + not_rescaled_not_subsampled_CpG + rescaled_subsampled_CpG + not_rescaled_subsampled_CpG) elif config["CpG_from_vcf_and_reference"] == True: - rescaled_not_subsampled_CpG = expand("results/historical/vcf/" + REF_NAME + "/stats/{sample}.merged.rmdup.merged.realn.rescaled.Q30.sorted.noCpG_vcfref.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.vcf.stats.txt", + rescaled_not_subsampled_CpG = expand("results/historical/vcf/" + REF_NAME + "/stats/vcf_biallelic_missing_{chr}/{sample}.merged.rmdup.merged.realn.rescaled.Q30.sorted.noCpG_vcfref.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.vcf.stats.txt", sample=HIST_RESCALED_NOT_SUBSAMPLED_CpG_SAMPLES, fmiss=config["f_missing"], chr=CHR,) - not_rescaled_not_subsampled_CpG = expand("results/historical/vcf/" + REF_NAME + "/stats/{sample}.merged.rmdup.merged.realn.Q30.sorted.noCpG_vcfref.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.vcf.stats.txt", 
+ not_rescaled_not_subsampled_CpG = expand("results/historical/vcf/" + REF_NAME + "/stats/vcf_biallelic_missing_{chr}/{sample}.merged.rmdup.merged.realn.Q30.sorted.noCpG_vcfref.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.vcf.stats.txt", sample=HIST_NOT_RESCALED_NOT_SUBSAMPLED_CpG_SAMPLES, fmiss=config["f_missing"], chr=CHR,) - rescaled_subsampled_CpG = expand("results/historical/vcf/" + REF_NAME + "/stats/{sample}.merged.rmdup.merged.realn.rescaled.mapped_q30.subs_dp{DP}.Q30.sorted.noCpG_vcfref.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.vcf.stats.txt", + rescaled_subsampled_CpG = expand("results/historical/vcf/" + REF_NAME + "/stats/vcf_biallelic_missing_{chr}/{sample}.merged.rmdup.merged.realn.rescaled.mapped_q30.subs_dp{DP}.Q30.sorted.noCpG_vcfref.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.vcf.stats.txt", sample=HIST_RESCALED_SUBSAMPLED_CpG_SAMPLES, DP=config["subsampling_depth"], fmiss=config["f_missing"], chr=CHR,) - not_rescaled_subsampled_CpG = expand("results/historical/vcf/" + REF_NAME + "/stats/{sample}.merged.rmdup.merged.realn.mapped_q30.subs_dp{DP}.Q30.sorted.noCpG_vcfref.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.vcf.stats.txt", + not_rescaled_subsampled_CpG = expand("results/historical/vcf/" + REF_NAME + "/stats/vcf_biallelic_missing_{chr}/{sample}.merged.rmdup.merged.realn.mapped_q30.subs_dp{DP}.Q30.sorted.noCpG_vcfref.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.vcf.stats.txt", sample=HIST_NOT_RESCALED_SUBSAMPLED_CpG_SAMPLES, DP=config["subsampling_depth"], fmiss=config["f_missing"], @@ -251,44 +251,44 @@ def historical_biallelic_missing_filtered_vcf_multiqc_inputs(wildcards): def modern_biallelic_missing_filtered_vcf_multiqc_inputs(wildcards): """Input for modern_biallelic_missing_filtered_vcf_multiqc_inputs""" - not_subsampled_not_CpG = expand("results/modern/vcf/" + REF_NAME + 
"/stats/{sample}.merged.rmdup.merged.realn.Q30.sorted.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.vcf.stats.txt", + not_subsampled_not_CpG = expand("results/modern/vcf/" + REF_NAME + "/stats/vcf_biallelic_missing_{chr}/{sample}.merged.rmdup.merged.realn.Q30.sorted.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.vcf.stats.txt", sample=MODERN_NOT_SUBSAMPLED_NOT_CpG_SAMPLES, fmiss=config["f_missing"], chr=CHR,) - subsampled_not_CpG = expand("results/modern/vcf/" + REF_NAME + "/stats/{sample}.merged.rmdup.merged.realn.mapped_q30.subs_dp{DP}.Q30.sorted.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.vcf.stats.txt", + subsampled_not_CpG = expand("results/modern/vcf/" + REF_NAME + "/stats/vcf_biallelic_missing_{chr}/{sample}.merged.rmdup.merged.realn.mapped_q30.subs_dp{DP}.Q30.sorted.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.vcf.stats.txt", sample=MODERN_SUBSAMPLED_NOT_CpG_SAMPLES, DP=config["subsampling_depth"], fmiss=config["f_missing"], chr=CHR,) outlist = (not_subsampled_not_CpG + subsampled_not_CpG) if config["CpG_from_vcf"] == True: - not_subsampled_CpG = expand("results/modern/vcf/" + REF_NAME + "/stats/{sample}.merged.rmdup.merged.realn.Q30.sorted.noCpG_vcf.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.vcf.stats.txt", + not_subsampled_CpG = expand("results/modern/vcf/" + REF_NAME + "/stats/vcf_biallelic_missing_{chr}/{sample}.merged.rmdup.merged.realn.Q30.sorted.noCpG_vcf.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.vcf.stats.txt", sample=MODERN_NOT_SUBSAMPLED_CpG_SAMPLES, fmiss=config["f_missing"], chr=CHR,) - subsampled_CpG = expand("results/modern/vcf/" + REF_NAME + "/stats/{sample}.merged.rmdup.merged.realn.mapped_q30.subs_dp{DP}.Q30.sorted.noCpG_vcf.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.vcf.stats.txt", + subsampled_CpG = expand("results/modern/vcf/" + REF_NAME + 
"/stats/vcf_biallelic_missing_{chr}/{sample}.merged.rmdup.merged.realn.mapped_q30.subs_dp{DP}.Q30.sorted.noCpG_vcf.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.vcf.stats.txt", sample=MODERN_SUBSAMPLED_CpG_SAMPLES, DP=config["subsampling_depth"], fmiss=config["f_missing"], chr=CHR,) outlist += (not_subsampled_CpG + subsampled_CpG) elif config["CpG_from_reference"] == True: - not_subsampled_CpG = expand("results/modern/vcf/" + REF_NAME + "/stats/{sample}.merged.rmdup.merged.realn.Q30.sorted.noCpG_ref.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.vcf.stats.txt", + not_subsampled_CpG = expand("results/modern/vcf/" + REF_NAME + "/stats/vcf_biallelic_missing_{chr}/{sample}.merged.rmdup.merged.realn.Q30.sorted.noCpG_ref.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.vcf.stats.txt", sample=MODERN_NOT_SUBSAMPLED_CpG_SAMPLES, fmiss=config["f_missing"], chr=CHR,) - subsampled_CpG = expand("results/modern/vcf/" + REF_NAME + "/stats/{sample}.merged.rmdup.merged.realn.mapped_q30.subs_dp{DP}.Q30.sorted.noCpG_ref.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.vcf.stats.txt", + subsampled_CpG = expand("results/modern/vcf/" + REF_NAME + "/stats/vcf_biallelic_missing_{chr}/{sample}.merged.rmdup.merged.realn.mapped_q30.subs_dp{DP}.Q30.sorted.noCpG_ref.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.vcf.stats.txt", sample=MODERN_SUBSAMPLED_CpG_SAMPLES, DP=config["subsampling_depth"], fmiss=config["f_missing"], chr=CHR,) outlist += (not_subsampled_CpG + subsampled_CpG) elif config["CpG_from_vcf_and_reference"] == True: - not_subsampled_CpG = expand("results/modern/vcf/" + REF_NAME + "/stats/{sample}.merged.rmdup.merged.realn.Q30.sorted.noCpG_vcfref.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.vcf.stats.txt", + not_subsampled_CpG = expand("results/modern/vcf/" + REF_NAME + 
"/stats/vcf_biallelic_missing_{chr}/{sample}.merged.rmdup.merged.realn.Q30.sorted.noCpG_vcfref.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.vcf.stats.txt", sample=MODERN_NOT_SUBSAMPLED_CpG_SAMPLES, fmiss=config["f_missing"], chr=CHR,) - subsampled_CpG = expand("results/modern/vcf/" + REF_NAME + "/stats/{sample}.merged.rmdup.merged.realn.mapped_q30.subs_dp{DP}.Q30.sorted.noCpG_vcfref.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.vcf.stats.txt", + subsampled_CpG = expand("results/modern/vcf/" + REF_NAME + "/stats/vcf_biallelic_missing_{chr}/{sample}.merged.rmdup.merged.realn.mapped_q30.subs_dp{DP}.Q30.sorted.noCpG_vcfref.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.vcf.stats.txt", sample=MODERN_SUBSAMPLED_CpG_SAMPLES, DP=config["subsampling_depth"], fmiss=config["f_missing"], @@ -641,7 +641,7 @@ rule repmasked_bcf2vcf: rule filter_biallelic_missing_vcf: - """Keep only sites with certain upper fraction missingness as specified in config file and sites that are biallelic across all samples from individual vcf files""" + """Keep only sites with certain upper fraction missingness as specified in config file and sites that are biallelic across all samples (and optionally autosomes) in individual vcf files""" input: vcf=rules.repmasked_bcf2vcf.output.vcf, bed=rules.filtered_vcf2bed.output.bed, @@ -664,7 +664,7 @@ rule biallelic_missing_filtered_vcf_stats: input: filtered="results/{dataset}/vcf/" + REF_NAME + "/{sample}.merged.rmdup.merged.{processed}.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.vcf.gz", output: - stats="results/{dataset}/vcf/" + REF_NAME + "/stats/{sample}.merged.rmdup.merged.{processed}.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.vcf.stats.txt", + stats="results/{dataset}/vcf/" + REF_NAME + "/stats/vcf_biallelic_missing_{chr}/{sample}.merged.rmdup.merged.{processed}.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}.vcf.stats.txt", log: 
"results/logs/9_merge_vcfs/{dataset}/" + REF_NAME + "/{sample}.{processed}_fmissing{fmiss}.{chr}_biallelic_missing_filtered_vcf_stats.log", singularity: @@ -676,14 +676,14 @@ rule biallelic_missing_filtered_vcf_stats: rule historical_biallelic_missing_filtered_vcf_multiqc: - """Collect all stats files from historical vcf files filtered for biallelic sites and missing data""" + """Collect all stats files from historical vcf files filtered for biallelic sites and missing data (and optionally sex chromosomes)""" input: historical_biallelic_missing_filtered_vcf_multiqc_inputs, output: - stats="results/historical/vcf/" + REF_NAME + "/stats/multiqc/multiqc_report.html", + stats="results/historical/vcf/" + REF_NAME + "/stats/vcf_biallelic_missing_{chr}/multiqc/multiqc_report.html", params: - indir="results/historical/vcf/" + REF_NAME + "/stats/", - outdir="results/historical/vcf/" + REF_NAME + "/stats/multiqc", + indir="results/historical/vcf/" + REF_NAME + "/stats/vcf_biallelic_missing_{chr}/", + outdir="results/historical/vcf/" + REF_NAME + "/stats/vcf_biallelic_missing_{chr}/multiqc", log: "results/logs/9_merge_vcfs/historical/" + REF_NAME + "/biallelic_missing_filtered_vcf_multiqc.log", singularity: @@ -695,14 +695,14 @@ rule historical_biallelic_missing_filtered_vcf_multiqc: rule modern_biallelic_missing_filtered_vcf_multiqc: - """Collect all stats files from modern vcf files filtered for biallelic sites and missing data""" + """Collect all stats files from modern vcf files filtered for biallelic sites and missing data (and optionally sex chromosomes)""" input: modern_biallelic_missing_filtered_vcf_multiqc_inputs, output: - stats="results/modern/vcf/" + REF_NAME + "/stats/multiqc/multiqc_report.html", + stats="results/modern/vcf/" + REF_NAME + "/stats/vcf_biallelic_missing_{chr}/multiqc/multiqc_report.html", params: - indir="results/modern/vcf/" + REF_NAME + "/stats/", - outdir="results/modern/vcf/" + REF_NAME + "/stats/multiqc", + indir="results/modern/vcf/" + 
REF_NAME + "/stats/vcf_biallelic_missing_{chr}/", + outdir="results/modern/vcf/" + REF_NAME + "/stats/vcf_biallelic_missing_{chr}/multiqc", log: "results/logs/9_merge_vcfs/modern/" + REF_NAME + "/biallelic_missing_filtered_vcf_multiqc.log", singularity: From 92f7aa92fc6d9db6f0e10ea511ae1e7614146269 Mon Sep 17 00:00:00 2001 From: verku Date: Mon, 9 Oct 2023 10:38:39 +0200 Subject: [PATCH 27/49] Add missing wildcard to avoid that two or more jobs write to the same log file --- workflow/rules/9_merge_vcfs.smk | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/workflow/rules/9_merge_vcfs.smk b/workflow/rules/9_merge_vcfs.smk index 4915b7c..15d246c 100644 --- a/workflow/rules/9_merge_vcfs.smk +++ b/workflow/rules/9_merge_vcfs.smk @@ -685,7 +685,7 @@ rule historical_biallelic_missing_filtered_vcf_multiqc: indir="results/historical/vcf/" + REF_NAME + "/stats/vcf_biallelic_missing_{chr}/", outdir="results/historical/vcf/" + REF_NAME + "/stats/vcf_biallelic_missing_{chr}/multiqc", log: - "results/logs/9_merge_vcfs/historical/" + REF_NAME + "/biallelic_missing_filtered_vcf_multiqc.log", + "results/logs/9_merge_vcfs/historical/" + REF_NAME + "/biallelic_missing_{chr}_filtered_vcf_multiqc.log", singularity: "docker://quay.io/biocontainers/multiqc:1.9--pyh9f0ad1d_0" shell: @@ -704,7 +704,7 @@ rule modern_biallelic_missing_filtered_vcf_multiqc: indir="results/modern/vcf/" + REF_NAME + "/stats/vcf_biallelic_missing_{chr}/", outdir="results/modern/vcf/" + REF_NAME + "/stats/vcf_biallelic_missing_{chr}/multiqc", log: - "results/logs/9_merge_vcfs/modern/" + REF_NAME + "/biallelic_missing_filtered_vcf_multiqc.log", + "results/logs/9_merge_vcfs/modern/" + REF_NAME + "/biallelic_missing_{chr}_filtered_vcf_multiqc.log", singularity: "docker://quay.io/biocontainers/multiqc:1.9--pyh9f0ad1d_0" shell: From 4c490984fc31ce8990151b72892e0cad19e939af Mon Sep 17 00:00:00 2001 From: verku Date: Mon, 9 Oct 2023 11:39:02 +0200 Subject: [PATCH 28/49] Remove output files from
all_outputs list that do not exist anymore --- workflow/rules/12_snpEff.smk | 4 ---- workflow/rules/13_GERP.smk | 4 ---- 2 files changed, 8 deletions(-) diff --git a/workflow/rules/12_snpEff.smk b/workflow/rules/12_snpEff.smk index 38ee804..6fd74b5 100644 --- a/workflow/rules/12_snpEff.smk +++ b/workflow/rules/12_snpEff.smk @@ -4,8 +4,6 @@ # Code collecting output files from this part of the pipeline if os.path.exists(config["historical_samples"]) and os.path.exists(config["modern_samples"]): - all_outputs.append(expand("results/{dataset}/vcf/" + REF_NAME + "/stats/multiqc/multiqc_report.html", - dataset=["historical", "modern"])) all_outputs.append(expand("results/all/snpEff/" + REF_NAME + ".all.fmissing{fmiss}.{chr}.snpEff_variant_impact_plot.pdf", fmiss=config["f_missing"], chr=CHR,)) @@ -13,14 +11,12 @@ if os.path.exists(config["historical_samples"]) and os.path.exists(config["moder all_outputs.append("results/modern/snpEff/" + REF_NAME + "/multiqc/multiqc_report.html") elif os.path.exists(config["historical_samples"]): - all_outputs.append("results/historical/vcf/" + REF_NAME + "/stats/multiqc/multiqc_report.html") all_outputs.append(expand("results/historical/snpEff/" + REF_NAME + ".historical.fmissing{fmiss}.{chr}.snpEff_variant_impact_plot.pdf", fmiss=config["f_missing"], chr=CHR,)) all_outputs.append("results/historical/snpEff/" + REF_NAME + "/multiqc/multiqc_report.html") elif os.path.exists(config["modern_samples"]): - all_outputs.append("results/modern/vcf/" + REF_NAME + "/stats/multiqc/multiqc_report.html") all_outputs.append(expand("results/modern/snpEff/" + REF_NAME + ".modern.fmissing{fmiss}.{chr}.snpEff_variant_impact_plot.pdf", fmiss=config["f_missing"], chr=CHR,)) diff --git a/workflow/rules/13_GERP.smk b/workflow/rules/13_GERP.smk index 42ae458..78d2afc 100644 --- a/workflow/rules/13_GERP.smk +++ b/workflow/rules/13_GERP.smk @@ -7,8 +7,6 @@ all_outputs.append(expand("results/gerp/" + REF_NAME + ".{chr}.ancestral.rates.g chr=CHR,)) if 
os.path.exists(config["historical_samples"]) and os.path.exists(config["modern_samples"]): - all_outputs.append(expand("results/{dataset}/vcf/" + REF_NAME + "/stats/multiqc/multiqc_report.html", - dataset=["historical", "modern"])) all_outputs.append(expand("results/gerp/all/" + REF_NAME + ".all.fmissing{fmiss}.{chr}.relative_mutational_load.gerp_{minGERP}_{maxGERP}_plot.pdf", fmiss=config["f_missing"], chr=CHR, @@ -16,7 +14,6 @@ if os.path.exists(config["historical_samples"]) and os.path.exists(config["moder maxGERP=config["max_gerp"],)) elif os.path.exists(config["historical_samples"]): - all_outputs.append("results/historical/vcf/" + REF_NAME + "/stats/multiqc/multiqc_report.html") all_outputs.append(expand("results/gerp/historical/" + REF_NAME + ".historical.fmissing{fmiss}.{chr}.relative_mutational_load.gerp_{minGERP}_{maxGERP}_plot.pdf", fmiss=config["f_missing"], chr=CHR, @@ -24,7 +21,6 @@ elif os.path.exists(config["historical_samples"]): maxGERP=config["max_gerp"],)) elif os.path.exists(config["modern_samples"]): - all_outputs.append("results/modern/vcf/" + REF_NAME + "/stats/multiqc/multiqc_report.html") all_outputs.append(expand("results/gerp/modern/" + REF_NAME + ".modern.fmissing{fmiss}.{chr}.relative_mutational_load.gerp_{minGERP}_{maxGERP}_plot.pdf", fmiss=config["f_missing"], chr=CHR, From b25501fffec8d19cb6eece79b37afc98bac1a364 Mon Sep 17 00:00:00 2001 From: verku Date: Tue, 10 Oct 2023 09:19:17 +0200 Subject: [PATCH 29/49] Describe average depth calculation more in detail --- .test/config/config_mitogenomes.yaml | 9 ++++++--- .test/config/config_mlRho_options.yaml | 9 ++++++--- .test/config/config_pca_roh.yaml | 9 ++++++--- .test/config/config_snpeff_gerp.yaml | 9 ++++++--- config/config.yaml | 9 ++++++--- 5 files changed, 30 insertions(+), 15 deletions(-) diff --git a/.test/config/config_mitogenomes.yaml b/.test/config/config_mitogenomes.yaml index 2256053..3225dfa 100644 --- a/.test/config/config_mitogenomes.yaml +++ 
b/.test/config/config_mitogenomes.yaml @@ -120,9 +120,12 @@ bam_rmdup_realign_indels: False # Parameters related to depth filtering of BAM and VCF files. # After BAM file processing, the average genome-wide depth is calculated # per sample, from which minimum and maximum depth thresholds for quality -# filtering are determined. -# In the calculation of the average genome-wide depth of coverage, -# sites with missing data (i.e. zero coverage) can be included or excluded. +# filtering are determined. +# Sites with mapping quality < 30 or in repeat elements are excluded +# by default from the calculation of the average genome-wide depth +# of coverage. +# Sites with missing data (i.e. zero coverage) can be included or +# excluded in the average depth calculation. # Set to True if sites with missing data (zero coverage) should be # included in the average depth calculation. # Set to False if sites with missing data (zero coverage) should be diff --git a/.test/config/config_mlRho_options.yaml b/.test/config/config_mlRho_options.yaml index 3857fef..aca8842 100644 --- a/.test/config/config_mlRho_options.yaml +++ b/.test/config/config_mlRho_options.yaml @@ -120,9 +120,12 @@ bam_rmdup_realign_indels: False # Parameters related to depth filtering of BAM and VCF files. # After BAM file processing, the average genome-wide depth is calculated # per sample, from which minimum and maximum depth thresholds for quality -# filtering are determined. -# In the calculation of the average genome-wide depth of coverage, -# sites with missing data (i.e. zero coverage) can be included or excluded. +# filtering are determined. +# Sites with mapping quality < 30 or in repeat elements are excluded +# by default from the calculation of the average genome-wide depth +# of coverage. +# Sites with missing data (i.e. zero coverage) can be included or +# excluded in the average depth calculation. 
# Set to True if sites with missing data (zero coverage) should be # included in the average depth calculation. # Set to False if sites with missing data (zero coverage) should be diff --git a/.test/config/config_pca_roh.yaml b/.test/config/config_pca_roh.yaml index f366271..70750ba 100644 --- a/.test/config/config_pca_roh.yaml +++ b/.test/config/config_pca_roh.yaml @@ -120,9 +120,12 @@ bam_rmdup_realign_indels: False # Parameters related to depth filtering of BAM and VCF files. # After BAM file processing, the average genome-wide depth is calculated # per sample, from which minimum and maximum depth thresholds for quality -# filtering are determined. -# In the calculation of the average genome-wide depth of coverage, -# sites with missing data (i.e. zero coverage) can be included or excluded. +# filtering are determined. +# Sites with mapping quality < 30 or in repeat elements are excluded +# by default from the calculation of the average genome-wide depth +# of coverage. +# Sites with missing data (i.e. zero coverage) can be included or +# excluded in the average depth calculation. # Set to True if sites with missing data (zero coverage) should be # included in the average depth calculation. # Set to False if sites with missing data (zero coverage) should be diff --git a/.test/config/config_snpeff_gerp.yaml b/.test/config/config_snpeff_gerp.yaml index 4b0f2f5..20f9dca 100644 --- a/.test/config/config_snpeff_gerp.yaml +++ b/.test/config/config_snpeff_gerp.yaml @@ -120,9 +120,12 @@ bam_rmdup_realign_indels: False # Parameters related to depth filtering of BAM and VCF files. # After BAM file processing, the average genome-wide depth is calculated # per sample, from which minimum and maximum depth thresholds for quality -# filtering are determined. -# In the calculation of the average genome-wide depth of coverage, -# sites with missing data (i.e. zero coverage) can be included or excluded. +# filtering are determined. 
+# Sites with mapping quality < 30 or in repeat elements are excluded +# by default from the calculation of the average genome-wide depth +# of coverage. +# Sites with missing data (i.e. zero coverage) can be included or +# excluded in the average depth calculation. # Set to True if sites with missing data (zero coverage) should be # included in the average depth calculation. # Set to False if sites with missing data (zero coverage) should be diff --git a/config/config.yaml b/config/config.yaml index aa21d74..04ff69c 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -120,9 +120,12 @@ bam_rmdup_realign_indels: False # Parameters related to depth filtering of BAM and VCF files. # After BAM file processing, the average genome-wide depth is calculated # per sample, from which minimum and maximum depth thresholds for quality -# filtering are determined. -# In the calculation of the average genome-wide depth of coverage, -# sites with missing data (i.e. zero coverage) can be included or excluded. +# filtering are determined. +# Sites with mapping quality < 30 or in repeat elements are excluded +# by default from the calculation of the average genome-wide depth +# of coverage. +# Sites with missing data (i.e. zero coverage) can be included or +# excluded in the average depth calculation. # Set to True if sites with missing data (zero coverage) should be # included in the average depth calculation. 
# Set to False if sites with missing data (zero coverage) should be From 73dfa575f7ab220ff0905d4e4bbabf02c0712e19 Mon Sep 17 00:00:00 2001 From: verku Date: Wed, 11 Oct 2023 09:38:01 +0200 Subject: [PATCH 30/49] Update script to merge mlRho results for genome-wide analyses to match updated file names --- workflow/scripts/mlRho_table.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflow/scripts/mlRho_table.py b/workflow/scripts/mlRho_table.py index 9237484..ea7f066 100644 --- a/workflow/scripts/mlRho_table.py +++ b/workflow/scripts/mlRho_table.py @@ -22,7 +22,7 @@ def mlRho_dataframe(inputfiles): dataset = path_list[1] sample = path_list[4].split(".")[0] genomeregion = path_list[4].split(".")[-3] - if genomeregion == "all": + if genomeregion == "genome": genomeregion = "genomewide" elif genomeregion == "autos": genomeregion = "autosomes" From 1fdd48df2395127304401363c86ffbb59e078858 Mon Sep 17 00:00:00 2001 From: verku Date: Thu, 12 Oct 2023 10:12:04 +0200 Subject: [PATCH 31/49] Add runtime to rules in slurm config file that was missing --- config/slurm/cluster.yaml | 2 +- config/slurm/profile/config.yaml | 11 +++++++++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/config/slurm/cluster.yaml b/config/slurm/cluster.yaml index 0178888..509cb1e 100644 --- a/config/slurm/cluster.yaml +++ b/config/slurm/cluster.yaml @@ -186,7 +186,7 @@ filter_vcfs_qual_dp: time: 05:00:00 cpus-per-task: 2 filter_vcfs_allelic_balance: - time: 05:00:00 + time: 1-00:00:00 cpus-per-task: 2 remove_repeats_vcf: time: 05:00:00 diff --git a/config/slurm/profile/config.yaml b/config/slurm/profile/config.yaml index 46ecaf1..26eddf9 100644 --- a/config/slurm/profile/config.yaml +++ b/config/slurm/profile/config.yaml @@ -207,22 +207,33 @@ set-resources: - make_noCpG_repma_bed:mem_mb=12800 ### create chromosome bed files - make_autosomes_bed:runtime=300 + - intersect_sexchr_repma_beds:runtime=300 - intersect_sexchr_repma_beds:mem_mb=12800 + - 
intersect_autos_repma_beds:runtime=300 - intersect_autos_repma_beds:mem_mb=12800 + - intersect_sexchr_CpG_repma_beds:runtime=300 - intersect_sexchr_CpG_repma_beds:mem_mb=12800 + - intersect_autos_CpG_repma_beds:runtime=300 - intersect_autos_CpG_repma_beds:mem_mb=12800 ### mlRho - bam2pro_autos:runtime=1440 - bam2pro_sexchr:runtime=1440 - bam2pro_all:runtime=1440 ### CpG filter VCF files + - remove_CpG_vcf:runtime=300 - remove_CpG_vcf:mem_mb=38400 + - CpG_vcf2bcf:runtime=300 - CpG_vcf2bcf:mem_mb=12800 ### process VCF files + - remove_snps_near_indels:runtime=300 - remove_snps_near_indels:mem_mb=12800 + - filter_vcfs_qual_dp:runtime=300 - filter_vcfs_qual_dp:mem_mb=12800 + - filter_vcfs_allelic_balance:runtime=300 - filter_vcfs_allelic_balance:mem_mb=12800 + - remove_repeats_vcf:runtime=300 - remove_repeats_vcf:mem_mb=38400 + - filtered_vcf2bcf:runtime=300 - filtered_vcf2bcf:mem_mb=12800 ### merge and process VCF files - merge_all_vcfs:runtime=4320 From 90bf991dc7d440b821cb607fd7158b241af388e1 Mon Sep 17 00:00:00 2001 From: verku Date: Thu, 12 Oct 2023 10:42:37 +0200 Subject: [PATCH 32/49] Make sure missingness_filtered_vcf_multiqc is also run on historical or modern samples only --- workflow/rules/9_merge_vcfs.smk | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/workflow/rules/9_merge_vcfs.smk b/workflow/rules/9_merge_vcfs.smk index 56bed87..44c9a3b 100644 --- a/workflow/rules/9_merge_vcfs.smk +++ b/workflow/rules/9_merge_vcfs.smk @@ -2,7 +2,13 @@ ### 9. 
Merge VCF files # Code collecting output files from this part of the pipeline -all_outputs.append("results/all/vcf/" + REF_NAME + "/stats/vcf_merged_missing/multiqc/multiqc_report.html") +if os.path.exists(config["historical_samples"]) and os.path.exists(config["modern_samples"]): + all_outputs.append("results/{dataset}/vcf/" + REF_NAME + "/stats/vcf_merged_missing/multiqc/multiqc_report.html", + dataset=["all", "historical", "modern"],) +elif os.path.exists(config["historical_samples"]): + all_outputs.append("results/historical/vcf/" + REF_NAME + "/stats/vcf_merged_missing/multiqc/multiqc_report.html") +elif os.path.exists(config["modern_samples"]): + all_outputs.append("results/modern/vcf/" + REF_NAME + "/stats/vcf_merged_missing/multiqc/multiqc_report.html") # Functions used by rules of this part of the pipeline @@ -450,12 +456,12 @@ rule missingness_filtered_vcf_multiqc: input: missingness_filtered_vcf_multiqc_inputs, output: - stats="results/all/vcf/" + REF_NAME + "/stats/vcf_merged_missing/multiqc/multiqc_report.html", + stats="results/{dataset}/vcf/" + REF_NAME + "/stats/vcf_merged_missing/multiqc/multiqc_report.html", params: - indir="results/all/vcf/" + REF_NAME + "/stats/vcf_merged_missing/", - outdir="results/all/vcf/" + REF_NAME + "/stats/vcf_merged_missing/multiqc", + indir="results/{dataset}/vcf/" + REF_NAME + "/stats/vcf_merged_missing/", + outdir="results/{dataset}/vcf/" + REF_NAME + "/stats/vcf_merged_missing/multiqc", log: - "results/logs/9_merge_vcfs/all/" + REF_NAME + "/missingness_filtered_vcf_multiqc.log", + "results/logs/9_merge_vcfs/{dataset}/" + REF_NAME + "/missingness_filtered_vcf_multiqc.log", singularity: "docker://quay.io/biocontainers/multiqc:1.9--pyh9f0ad1d_0" shell: From feaa78f5098c55268fadc4a62c6eb6f47991f57d Mon Sep 17 00:00:00 2001 From: verku Date: Thu, 12 Oct 2023 11:22:43 +0200 Subject: [PATCH 33/49] Add missing expand() function to collect output files from multiqc --- workflow/rules/9_merge_vcfs.smk | 4 ++-- 1 file changed, 
2 insertions(+), 2 deletions(-) diff --git a/workflow/rules/9_merge_vcfs.smk b/workflow/rules/9_merge_vcfs.smk index 44c9a3b..c5880cc 100644 --- a/workflow/rules/9_merge_vcfs.smk +++ b/workflow/rules/9_merge_vcfs.smk @@ -3,8 +3,8 @@ # Code collecting output files from this part of the pipeline if os.path.exists(config["historical_samples"]) and os.path.exists(config["modern_samples"]): - all_outputs.append("results/{dataset}/vcf/" + REF_NAME + "/stats/vcf_merged_missing/multiqc/multiqc_report.html", - dataset=["all", "historical", "modern"],) + all_outputs.append(expand("results/{dataset}/vcf/" + REF_NAME + "/stats/vcf_merged_missing/multiqc/multiqc_report.html", + dataset=["all", "historical", "modern"],)) elif os.path.exists(config["historical_samples"]): all_outputs.append("results/historical/vcf/" + REF_NAME + "/stats/vcf_merged_missing/multiqc/multiqc_report.html") elif os.path.exists(config["modern_samples"]): From 303528692ca1fd631a38d8ee26b08d8e855b7f31 Mon Sep 17 00:00:00 2001 From: verku Date: Fri, 13 Oct 2023 09:56:30 +0200 Subject: [PATCH 34/49] Add MultiQC files for merged and filtered VCF files to the pipeline report --- Snakefile | 1 + workflow/report/missingness_filtered_vcf_multiqc.rst | 1 + workflow/rules/9_merge_vcfs.smk | 5 ++++- 3 files changed, 6 insertions(+), 1 deletion(-) create mode 100644 workflow/report/missingness_filtered_vcf_multiqc.rst diff --git a/Snakefile b/Snakefile index 8a1cf44..9296446 100644 --- a/Snakefile +++ b/Snakefile @@ -309,6 +309,7 @@ rule all: if (config["bam_rmdup_realign_indels"] + or config["merge_vcfs_per_dataset"] or config["mlRho"] or config["pca"] or config["ROH"] diff --git a/workflow/report/missingness_filtered_vcf_multiqc.rst b/workflow/report/missingness_filtered_vcf_multiqc.rst new file mode 100644 index 0000000..08f02d3 --- /dev/null +++ b/workflow/report/missingness_filtered_vcf_multiqc.rst @@ -0,0 +1 @@ +MultiQC report, summarizing statistics for merged and filtered VCF files including {{ 
snakemake.wildcards.dataset }} samples. \ No newline at end of file diff --git a/workflow/rules/9_merge_vcfs.smk b/workflow/rules/9_merge_vcfs.smk index 7a998a3..3766baf 100644 --- a/workflow/rules/9_merge_vcfs.smk +++ b/workflow/rules/9_merge_vcfs.smk @@ -612,7 +612,10 @@ rule missingness_filtered_vcf_multiqc: input: missingness_filtered_vcf_multiqc_inputs, output: - stats="results/{dataset}/vcf/" + REF_NAME + "/stats/vcf_merged_missing/multiqc/multiqc_report.html", + stats=report( + "results/{dataset}/vcf/" + REF_NAME + "/stats/vcf_merged_missing/multiqc/multiqc_report.html", + caption="../report/missingness_filtered_vcf_multiqc.rst", + category="VCF file processing",), params: indir="results/{dataset}/vcf/" + REF_NAME + "/stats/vcf_merged_missing/", outdir="results/{dataset}/vcf/" + REF_NAME + "/stats/vcf_merged_missing/multiqc", From c7746fb915347e5af2025e3911f209034d557244 Mon Sep 17 00:00:00 2001 From: verku Date: Tue, 17 Oct 2023 11:35:19 +0200 Subject: [PATCH 35/49] Update repeatmodeler to version 2.0.4 to handle large genomes --- workflow/rules/0.2_repeat_identification.smk | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/workflow/rules/0.2_repeat_identification.smk b/workflow/rules/0.2_repeat_identification.smk index e578212..e5a84b7 100644 --- a/workflow/rules/0.2_repeat_identification.smk +++ b/workflow/rules/0.2_repeat_identification.smk @@ -39,7 +39,7 @@ rule cp_repeatmasker_libs: log: "results/logs/0.2_repeat_identification/" + REF_NAME + "_cp_repeatmasker_libs.log", singularity: - "docker://quay.io/biocontainers/repeatmodeler:2.0.1--pl526_0" + "docker://quay.io/biocontainers/repeatmodeler:2.0.4--pl5321hdfd78af_0" shell: """ cp /usr/local/share/RepeatMasker/Libraries/* workflow/resources/RepeatMasker/Libraries/ 2> {log} @@ -75,7 +75,7 @@ rule make_repma_blast_db: log: os.path.abspath("results/logs/0.2_repeat_identification/" + REF_NAME + "_make_repma_blast_db.log"), singularity: - 
"docker://quay.io/biocontainers/repeatmodeler:2.0.1--pl526_0" + "docker://quay.io/biocontainers/repeatmodeler:2.0.4--pl5321hdfd78af_0" shell: """ cd {params.dir} @@ -99,7 +99,7 @@ rule repeatmodeler: os.path.abspath("results/logs/0.2_repeat_identification/" + REF_NAME + "_repeatmodeler.log"), threads: 16 singularity: - "docker://quay.io/biocontainers/repeatmodeler:2.0.1--pl526_0" + "docker://quay.io/biocontainers/repeatmodeler:2.0.4--pl5321hdfd78af_0" shell: """ cd {params.dir} @@ -139,7 +139,7 @@ rule repeatclassifier: "results/logs/0.2_repeat_identification/" + REF_NAME + "_repeatclassifier.log", threads: 2 singularity: - "docker://quay.io/biocontainers/repeatmodeler:2.0.1--pl526_0" + "docker://quay.io/biocontainers/repeatmodeler:2.0.4--pl5321hdfd78af_0" shell: """ RepeatClassifier -repeatmasker_dir {params.repma_dir} -consensi {input.repmo} -stockholm {input.stk} 2> {log} From 202889b33144ebbad89fd6f4724c04622ca4490c Mon Sep 17 00:00:00 2001 From: verku Date: Tue, 17 Oct 2023 12:50:18 +0200 Subject: [PATCH 36/49] Add Github action Free Disk Space (Ubuntu) to free some disk space between setting up singularity and conda and running GenErode --- .github/workflows/gerp.yaml | 13 +++++++++++++ .github/workflows/mitogenome_mapping.yaml | 13 +++++++++++++ .github/workflows/mlRho_options.yaml | 13 +++++++++++++ .github/workflows/pca_roh.yaml | 13 +++++++++++++ .github/workflows/snpeff.yaml | 13 +++++++++++++ 5 files changed, 65 insertions(+) diff --git a/.github/workflows/gerp.yaml b/.github/workflows/gerp.yaml index d87669b..2051d7b 100644 --- a/.github/workflows/gerp.yaml +++ b/.github/workflows/gerp.yaml @@ -51,6 +51,19 @@ jobs: conda info conda list + - name: Free Up GitHub Actions Ubuntu Runner Disk Space 🔧 + uses: jlumbroso/free-disk-space@main + with: + # This might remove tools that are actually needed, if set to "true" but frees about 6 GB + tool-cache: false + + # All of these default to true, but feel free to set to "false" if necessary for your workflow + 
android: true + dotnet: true + haskell: true + large-packages: true + swap-storage: true + - name: gerp_dry shell: bash -l {0} run: | diff --git a/.github/workflows/mitogenome_mapping.yaml b/.github/workflows/mitogenome_mapping.yaml index 0d3d257..b5f7992 100644 --- a/.github/workflows/mitogenome_mapping.yaml +++ b/.github/workflows/mitogenome_mapping.yaml @@ -75,6 +75,19 @@ jobs: conda info conda list + - name: Free Up GitHub Actions Ubuntu Runner Disk Space 🔧 + uses: jlumbroso/free-disk-space@main + with: + # This might remove tools that are actually needed, if set to "true" but frees about 6 GB + tool-cache: false + + # All of these default to true, but feel free to set to "false" if necessary for your workflow + android: true + dotnet: true + haskell: true + large-packages: true + swap-storage: true + - name: mitogenome_mapping_dry shell: bash -l {0} run: | diff --git a/.github/workflows/mlRho_options.yaml b/.github/workflows/mlRho_options.yaml index 22f8c27..f245c48 100644 --- a/.github/workflows/mlRho_options.yaml +++ b/.github/workflows/mlRho_options.yaml @@ -57,6 +57,19 @@ jobs: conda info conda list + - name: Free Up GitHub Actions Ubuntu Runner Disk Space 🔧 + uses: jlumbroso/free-disk-space@main + with: + # This might remove tools that are actually needed, if set to "true" but frees about 6 GB + tool-cache: false + + # All of these default to true, but feel free to set to "false" if necessary for your workflow + android: true + dotnet: true + haskell: true + large-packages: true + swap-storage: true + - name: mlRho_options_dry shell: bash -l {0} run: | diff --git a/.github/workflows/pca_roh.yaml b/.github/workflows/pca_roh.yaml index b9788fa..a27d99b 100644 --- a/.github/workflows/pca_roh.yaml +++ b/.github/workflows/pca_roh.yaml @@ -49,6 +49,19 @@ jobs: conda info conda list + - name: Free Up GitHub Actions Ubuntu Runner Disk Space 🔧 + uses: jlumbroso/free-disk-space@main + with: + # This might remove tools that are actually needed, if set to "true" but 
frees about 6 GB + tool-cache: false + + # All of these default to true, but feel free to set to "false" if necessary for your workflow + android: true + dotnet: true + haskell: true + large-packages: true + swap-storage: true + - name: pca_roh_dry shell: bash -l {0} run: | diff --git a/.github/workflows/snpeff.yaml b/.github/workflows/snpeff.yaml index 5660760..5e656c5 100644 --- a/.github/workflows/snpeff.yaml +++ b/.github/workflows/snpeff.yaml @@ -51,6 +51,19 @@ jobs: conda info conda list + - name: Free Up GitHub Actions Ubuntu Runner Disk Space 🔧 + uses: jlumbroso/free-disk-space@main + with: + # This might remove tools that are actually needed, if set to "true" but frees about 6 GB + tool-cache: false + + # All of these default to true, but feel free to set to "false" if necessary for your workflow + android: true + dotnet: true + haskell: true + large-packages: true + swap-storage: true + - name: snpeff_dry shell: bash -l {0} run: | From 1643221bc502723110f1352bf41ba53a8eb680ac Mon Sep 17 00:00:00 2001 From: verku Date: Tue, 17 Oct 2023 16:46:59 +0200 Subject: [PATCH 37/49] Copy the repeat library from the updated repeatmodeler container --- workflow/rules/0.2_repeat_identification.smk | 60 ++------------------ 1 file changed, 5 insertions(+), 55 deletions(-) diff --git a/workflow/rules/0.2_repeat_identification.smk b/workflow/rules/0.2_repeat_identification.smk index e5a84b7..08e2ef8 100644 --- a/workflow/rules/0.2_repeat_identification.smk +++ b/workflow/rules/0.2_repeat_identification.smk @@ -21,65 +21,20 @@ rule ref_upper: """ -rule cp_repeatmasker_libs: - """Copy RepeatMasker libraries from container""" - output: - art=temp("workflow/resources/RepeatMasker/Libraries/Artefacts.embl"), - embl=temp("workflow/resources/RepeatMasker/Libraries/Dfam.embl"), - hmm=temp("workflow/resources/RepeatMasker/Libraries/Dfam.hmm"), - repann=temp("workflow/resources/RepeatMasker/Libraries/RepeatAnnotationData.pm"), - 
phr=temp("workflow/resources/RepeatMasker/Libraries/RepeatPeps.lib.phr"), - psq=temp("workflow/resources/RepeatMasker/Libraries/RepeatPeps.lib.psq"), - lib=temp("workflow/resources/RepeatMasker/Libraries/RepeatPeps.lib"), - pin=temp("workflow/resources/RepeatMasker/Libraries/RepeatPeps.lib.pin"), - peprm=temp("workflow/resources/RepeatMasker/Libraries/RepeatPeps.readme"), - meta=temp("workflow/resources/RepeatMasker/Libraries/RMRBMeta.embl"), - rm=temp("workflow/resources/RepeatMasker/Libraries/README.meta"), - tax=temp("workflow/resources/RepeatMasker/Libraries/taxonomy.dat"), - log: - "results/logs/0.2_repeat_identification/" + REF_NAME + "_cp_repeatmasker_libs.log", - singularity: - "docker://quay.io/biocontainers/repeatmodeler:2.0.4--pl5321hdfd78af_0" - shell: - """ - cp /usr/local/share/RepeatMasker/Libraries/* workflow/resources/RepeatMasker/Libraries/ 2> {log} - """ - - -rule embl2fasta: - """Convert Dfam embl to fasta format""" - input: - dfam_embl=rules.cp_repeatmasker_libs.output.embl, +rule cp_repeatmasker_lib: + """Copy RepeatMasker library from container""" output: rm_lib=temp("workflow/resources/RepeatMasker/Libraries/RepeatMasker.lib"), - log: - "results/logs/0.2_repeat_identification/" + REF_NAME + "_embl2fasta.log", - run: - from Bio import SeqIO - with open(input.dfam_embl, "rU") as input_handle, open(output.rm_lib, "w") as output_handle: - sequences = SeqIO.parse(input_handle, "embl") - count = SeqIO.write(sequences, output_handle, "fasta") - print("Converted %i records" % count) - - -rule make_repma_blast_db: - input: - rm_lib=rules.embl2fasta.output.rm_lib, - output: nhr=temp("workflow/resources/RepeatMasker/Libraries/RepeatMasker.lib.nhr"), nin=temp("workflow/resources/RepeatMasker/Libraries/RepeatMasker.lib.nin"), nsq=temp("workflow/resources/RepeatMasker/Libraries/RepeatMasker.lib.nsq"), - params: - dir="workflow/resources/RepeatMasker/Libraries/", - rm_lib="RepeatMasker.lib", log: - os.path.abspath("results/logs/0.2_repeat_identification/" 
+ REF_NAME + "_make_repma_blast_db.log"), + "results/logs/0.2_repeat_identification/" + REF_NAME + "_cp_repeatmasker_libs.log", singularity: "docker://quay.io/biocontainers/repeatmodeler:2.0.4--pl5321hdfd78af_0" shell: """ - cd {params.dir} - makeblastdb -dbtype nucl -in {params.rm_lib} 2> {log} + cp /usr/local/share/RepeatMasker/Libraries/RepeatMasker.lib* {output.rm_lib} 2> {log} """ @@ -127,8 +82,6 @@ rule repeatclassifier: input: repmo=rules.repeatmodeler.output.repmo, stk=rules.repeatmodeler.output.stk, - rm_lib=rules.embl2fasta.output.rm_lib, - rm_db=rules.make_repma_blast_db.output, rm_libs=rules.cp_repeatmasker_libs.output, output: repmo=REF_DIR + "/repeatmodeler/" + REF_NAME + "/RM_raw.out/consensi.fa.classified", @@ -151,8 +104,6 @@ rule repeatmasker: input: ref_upper=rules.ref_upper.output, repmo=rules.repeatclassifier.output.repmo, - rm_lib=rules.embl2fasta.output.rm_lib, - rm_db=rules.make_repma_blast_db.output, rm_libs=rules.cp_repeatmasker_libs.output, output: rep_masked=REF_DIR + "/repeatmasker/" + REF_NAME + "/" + REF_NAME + ".upper.fasta.masked", @@ -169,10 +120,9 @@ rule repeatmasker: os.path.abspath("results/logs/0.2_repeat_identification/" + REF_NAME + "_repeatmasker.log"), threads: 16 singularity: - "docker://quay.io/biocontainers/repeatmasker:4.0.9_p2--pl526_2" + "docker://quay.io/biocontainers/repeatmodeler:2.0.4--pl5321hdfd78af_0" shell: """ - export REPEATMASKER_LIB_DIR=$PWD/workflow/resources/RepeatMasker/Libraries && cd {params.dir} && RepeatMasker -pa {threads} -a -xsmall -gccalc -dir ./ -lib {params.repmo} {params.ref_upper} 2> {log} && From f518fd6620e732aac69f27fe130b3e011ee8a553 Mon Sep 17 00:00:00 2001 From: verku Date: Tue, 17 Oct 2023 16:49:33 +0200 Subject: [PATCH 38/49] Remove parameter pointing to RepeatMasker installation that wrongly pointed to RepeatMasker libraries --- workflow/rules/0.2_repeat_identification.smk | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git 
a/workflow/rules/0.2_repeat_identification.smk b/workflow/rules/0.2_repeat_identification.smk index 08e2ef8..a6583bb 100644 --- a/workflow/rules/0.2_repeat_identification.smk +++ b/workflow/rules/0.2_repeat_identification.smk @@ -82,12 +82,9 @@ rule repeatclassifier: input: repmo=rules.repeatmodeler.output.repmo, stk=rules.repeatmodeler.output.stk, - rm_libs=rules.cp_repeatmasker_libs.output, output: repmo=REF_DIR + "/repeatmodeler/" + REF_NAME + "/RM_raw.out/consensi.fa.classified", stk=REF_DIR + "/repeatmodeler/" + REF_NAME + "/RM_raw.out/families-classified.stk", - params: - repma_dir="workflow/resources/RepeatMasker", log: "results/logs/0.2_repeat_identification/" + REF_NAME + "_repeatclassifier.log", threads: 2 @@ -95,7 +92,7 @@ rule repeatclassifier: "docker://quay.io/biocontainers/repeatmodeler:2.0.4--pl5321hdfd78af_0" shell: """ - RepeatClassifier -repeatmasker_dir {params.repma_dir} -consensi {input.repmo} -stockholm {input.stk} 2> {log} + RepeatClassifier -consensi {input.repmo} -stockholm {input.stk} 2> {log} """ From 8d0aab5a8f3fd31b743727b987a0c7b253a5cc3d Mon Sep 17 00:00:00 2001 From: verku Date: Tue, 17 Oct 2023 16:50:48 +0200 Subject: [PATCH 39/49] Remove rule that copies RepeatMasker library as this does not seem to be required in the new container --- workflow/rules/0.2_repeat_identification.smk | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/workflow/rules/0.2_repeat_identification.smk b/workflow/rules/0.2_repeat_identification.smk index a6583bb..4cc101d 100644 --- a/workflow/rules/0.2_repeat_identification.smk +++ b/workflow/rules/0.2_repeat_identification.smk @@ -21,23 +21,6 @@ rule ref_upper: """ -rule cp_repeatmasker_lib: - """Copy RepeatMasker library from container""" - output: - rm_lib=temp("workflow/resources/RepeatMasker/Libraries/RepeatMasker.lib"), - nhr=temp("workflow/resources/RepeatMasker/Libraries/RepeatMasker.lib.nhr"), - nin=temp("workflow/resources/RepeatMasker/Libraries/RepeatMasker.lib.nin"), - 
nsq=temp("workflow/resources/RepeatMasker/Libraries/RepeatMasker.lib.nsq"), - log: - "results/logs/0.2_repeat_identification/" + REF_NAME + "_cp_repeatmasker_libs.log", - singularity: - "docker://quay.io/biocontainers/repeatmodeler:2.0.4--pl5321hdfd78af_0" - shell: - """ - cp /usr/local/share/RepeatMasker/Libraries/RepeatMasker.lib* {output.rm_lib} 2> {log} - """ - - rule repeatmodeler: """RepeatModeler for de novo repeat prediction from a reference assembly""" input: @@ -101,7 +84,6 @@ rule repeatmasker: input: ref_upper=rules.ref_upper.output, repmo=rules.repeatclassifier.output.repmo, - rm_libs=rules.cp_repeatmasker_libs.output, output: rep_masked=REF_DIR + "/repeatmasker/" + REF_NAME + "/" + REF_NAME + ".upper.fasta.masked", rep_align=REF_DIR + "/repeatmasker/" + REF_NAME + "/" + REF_NAME + ".upper.fasta.align", From 01203816fa7436e19188e37669420f51e063ddc8 Mon Sep 17 00:00:00 2001 From: verku Date: Tue, 17 Oct 2023 16:58:28 +0200 Subject: [PATCH 40/49] Remove repeatclassifier rule to test if the final output is generated by the latest repeatmodeler version --- workflow/rules/0.2_repeat_identification.smk | 29 ++++---------------- 1 file changed, 5 insertions(+), 24 deletions(-) diff --git a/workflow/rules/0.2_repeat_identification.smk b/workflow/rules/0.2_repeat_identification.smk index 4cc101d..c4fd741 100644 --- a/workflow/rules/0.2_repeat_identification.smk +++ b/workflow/rules/0.2_repeat_identification.smk @@ -26,8 +26,8 @@ rule repeatmodeler: input: ref_upper=rules.ref_upper.output, output: - repmo=REF_DIR + "/repeatmodeler/" + REF_NAME + "/RM_raw.out/consensi.fa", - stk=REF_DIR + "/repeatmodeler/" + REF_NAME + "/RM_raw.out/families.stk", + repmo=REF_DIR + "/repeatmodeler/" + REF_NAME + "/RM_raw.out/consensi.fa.classified", + stk=REF_DIR + "/repeatmodeler/" + REF_NAME + "/RM_raw.out/families-classified.stk", params: dir=REF_DIR + "/repeatmodeler/" + REF_NAME + "/", name=REF_NAME, @@ -49,8 +49,8 @@ rule repeatmodeler: RepeatModeler -engine ncbi -pa 
{threads} -database {params.name} 2>> {log} && # copy the output files to a new directory - cp RM_*.*/consensi.fa RM_raw.out/ 2>> {log} && - cp RM_*.*/families.stk RM_raw.out/ 2>> {log} + cp RM_*.*/consensi.fa.classified RM_raw.out/ 2>> {log} && + cp RM_*.*/families-classified.stk RM_raw.out/ 2>> {log} # remove temporary file if [ -f {params.abs_tmp} ] @@ -60,30 +60,11 @@ rule repeatmodeler: """ -rule repeatclassifier: - """Create final RepeatModeler output files""" - input: - repmo=rules.repeatmodeler.output.repmo, - stk=rules.repeatmodeler.output.stk, - output: - repmo=REF_DIR + "/repeatmodeler/" + REF_NAME + "/RM_raw.out/consensi.fa.classified", - stk=REF_DIR + "/repeatmodeler/" + REF_NAME + "/RM_raw.out/families-classified.stk", - log: - "results/logs/0.2_repeat_identification/" + REF_NAME + "_repeatclassifier.log", - threads: 2 - singularity: - "docker://quay.io/biocontainers/repeatmodeler:2.0.4--pl5321hdfd78af_0" - shell: - """ - RepeatClassifier -consensi {input.repmo} -stockholm {input.stk} 2> {log} - """ - - rule repeatmasker: """Repeat mask the full genome assembly using raw de novo predicted repeats""" input: ref_upper=rules.ref_upper.output, - repmo=rules.repeatclassifier.output.repmo, + repmo=rules.repeatmodeler.output.repmo, output: rep_masked=REF_DIR + "/repeatmasker/" + REF_NAME + "/" + REF_NAME + ".upper.fasta.masked", rep_align=REF_DIR + "/repeatmasker/" + REF_NAME + "/" + REF_NAME + ".upper.fasta.align", From f61979da3fd6321867a0360cf1d09893b1a4b350 Mon Sep 17 00:00:00 2001 From: verku Date: Tue, 17 Oct 2023 17:00:32 +0200 Subject: [PATCH 41/49] Update repeatmodeler parameter according to container version --- workflow/rules/0.2_repeat_identification.smk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflow/rules/0.2_repeat_identification.smk b/workflow/rules/0.2_repeat_identification.smk index c4fd741..42b1ca8 100644 --- a/workflow/rules/0.2_repeat_identification.smk +++ b/workflow/rules/0.2_repeat_identification.smk @@ 
-46,7 +46,7 @@ rule repeatmodeler: BuildDatabase -engine ncbi -name {params.name} {params.ref_upper} 2> {log} && # Run RepeatModeler - RepeatModeler -engine ncbi -pa {threads} -database {params.name} 2>> {log} && + RepeatModeler -engine ncbi -threads {threads} -database {params.name} 2>> {log} && # copy the output files to a new directory cp RM_*.*/consensi.fa.classified RM_raw.out/ 2>> {log} && From 0702bf00b5d6fadeb8195066df8378bd33363f1e Mon Sep 17 00:00:00 2001 From: verku Date: Wed, 25 Oct 2023 10:50:29 +0200 Subject: [PATCH 42/49] Remove temporary flag from merged vcf to avoid triggering a re-run of merging when testing different filtering levels for missingness --- workflow/rules/9_merge_vcfs.smk | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/workflow/rules/9_merge_vcfs.smk b/workflow/rules/9_merge_vcfs.smk index 3766baf..be46e0a 100644 --- a/workflow/rules/9_merge_vcfs.smk +++ b/workflow/rules/9_merge_vcfs.smk @@ -310,7 +310,7 @@ rule merge_all_vcfs: bcf=merge_all_inputs, index=merge_all_index_inputs, output: - merged=temp("results/all/vcf/" + REF_NAME + ".all.merged.snps.bcf"), + merged="results/all/vcf/" + REF_NAME + ".all.merged.snps.bcf", threads: 6 log: "results/logs/9_merge_vcfs/" + REF_NAME + "_merge_all_vcfs.log", @@ -334,7 +334,7 @@ rule index_merged_vcf: input: bcf="results/all/vcf/" + REF_NAME + ".all.merged.snps.bcf", output: - index=temp("results/all/vcf/" + REF_NAME + ".all.merged.snps.bcf.csi"), + index="results/all/vcf/" + REF_NAME + ".all.merged.snps.bcf.csi", group: "merged_vcf_group" log: From 8447c9ba2f567d0eb8be6c4347d3235c10f14fbf Mon Sep 17 00:00:00 2001 From: verku Date: Wed, 25 Oct 2023 11:08:08 +0200 Subject: [PATCH 43/49] Remove copying the repeatmask bed file to reduce file number and triggering reruns of downstream rules --- workflow/rules/0.2_repeat_identification.smk | 4 +--- workflow/rules/6_autosome_sexchromosome_bed_files.smk | 8 ++++---- workflow/rules/8.2_vcf_qual_repeat_filtering.smk | 2 +- 3 files 
changed, 6 insertions(+), 8 deletions(-) diff --git a/workflow/rules/0.2_repeat_identification.smk b/workflow/rules/0.2_repeat_identification.smk index 42b1ca8..010cc00 100644 --- a/workflow/rules/0.2_repeat_identification.smk +++ b/workflow/rules/0.2_repeat_identification.smk @@ -137,7 +137,6 @@ rule make_no_repeats_bed: sorted_rep_bed=rules.sort_repeats_bed.output.sorted_rep_bed, output: no_rep_bed=REF_DIR + "/" + REF_NAME + ".repma.bed", - no_rep_bed_dir="results/" + REF_NAME + ".repma.bed", group: "reference_group" log: @@ -146,6 +145,5 @@ rule make_no_repeats_bed: "docker://nbisweden/generode-bedtools-2.29.2" shell: """ - bedtools subtract -a {input.ref_bed} -b {input.sorted_rep_bed} > {output.no_rep_bed} 2> {log} && - cp {output.no_rep_bed} {output.no_rep_bed_dir} 2>> {log} + bedtools subtract -a {input.ref_bed} -b {input.sorted_rep_bed} > {output.no_rep_bed} 2> {log} """ diff --git a/workflow/rules/6_autosome_sexchromosome_bed_files.smk b/workflow/rules/6_autosome_sexchromosome_bed_files.smk index eda4d61..914d7ce 100644 --- a/workflow/rules/6_autosome_sexchromosome_bed_files.smk +++ b/workflow/rules/6_autosome_sexchromosome_bed_files.smk @@ -51,7 +51,7 @@ rule make_autosomes_bed: rule intersect_sexchr_repma_beds: input: - no_rep_bed_dir=rules.make_no_repeats_bed.output.no_rep_bed_dir, + no_rep_bed=rules.make_no_repeats_bed.output.no_rep_bed, sexchr_bed=rules.make_sexchr_bed.output, output: repma_sex_chr="results/" + REF_NAME + ".repma.sexchr.bed", @@ -64,13 +64,13 @@ rule intersect_sexchr_repma_beds: "docker://nbisweden/generode-bedtools-2.29.2" shell: """ - bedtools intersect -a {input.no_rep_bed_dir} -b {input.sexchr_bed} > {output.repma_sex_chr} 2> {log} + bedtools intersect -a {input.no_rep_bed} -b {input.sexchr_bed} > {output.repma_sex_chr} 2> {log} """ rule intersect_autos_repma_beds: input: - no_rep_bed_dir=rules.make_no_repeats_bed.output.no_rep_bed_dir, + no_rep_bed=rules.make_no_repeats_bed.output.no_rep_bed, 
autosome_bed=rules.make_autosomes_bed.output, output: repma_autos="results/" + REF_NAME + ".repma.autos.bed", @@ -83,7 +83,7 @@ rule intersect_autos_repma_beds: "docker://nbisweden/generode-bedtools-2.29.2" shell: """ - bedtools intersect -a {input.no_rep_bed_dir} -b {input.autosome_bed} > {output.repma_autos} 2> {log} + bedtools intersect -a {input.no_rep_bed} -b {input.autosome_bed} > {output.repma_autos} 2> {log} """ diff --git a/workflow/rules/8.2_vcf_qual_repeat_filtering.smk b/workflow/rules/8.2_vcf_qual_repeat_filtering.smk index 805d76e..d0304e5 100644 --- a/workflow/rules/8.2_vcf_qual_repeat_filtering.smk +++ b/workflow/rules/8.2_vcf_qual_repeat_filtering.smk @@ -359,7 +359,7 @@ rule remove_repeats_vcf: """Remove repeats from vcf files""" input: vcf=rules.filtered_bcf2vcf.output.vcf, - bed=rules.make_no_repeats_bed.output.no_rep_bed_dir, + bed=rules.make_no_repeats_bed.output.no_rep_bed, genomefile=rules.genome_file.output.genomefile, output: filtered=temp("results/{dataset}/vcf/" + REF_NAME + "/{sample}.merged.rmdup.merged.{processed}.snps5.noIndel.QUAL30.dp.AB.repma.vcf.gz"), From d2b864931c5ecf8b64d5eedc53e370263523d884 Mon Sep 17 00:00:00 2001 From: verku Date: Wed, 25 Oct 2023 11:23:46 +0200 Subject: [PATCH 44/49] Skipping extracting modern or historical samples from merged and filtered VCF file when the pipeline is only run for modern or historical samples, respectively, avoiding generation and storage of duplicate VCF files --- workflow/rules/9_merge_vcfs.smk | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/workflow/rules/9_merge_vcfs.smk b/workflow/rules/9_merge_vcfs.smk index be46e0a..577263a 100644 --- a/workflow/rules/9_merge_vcfs.smk +++ b/workflow/rules/9_merge_vcfs.smk @@ -6,9 +6,9 @@ if os.path.exists(config["historical_samples"]) and os.path.exists(config["moder all_outputs.append(expand("results/{dataset}/vcf/" + REF_NAME + "/stats/vcf_merged_missing/multiqc/multiqc_report.html", dataset=["all", "historical", 
"modern"],)) elif os.path.exists(config["historical_samples"]): - all_outputs.append("results/historical/vcf/" + REF_NAME + "/stats/vcf_merged_missing/multiqc/multiqc_report.html") + all_outputs.append("results/all/vcf/" + REF_NAME + "/stats/vcf_merged_missing/multiqc/multiqc_report.html") elif os.path.exists(config["modern_samples"]): - all_outputs.append("results/modern/vcf/" + REF_NAME + "/stats/vcf_merged_missing/multiqc/multiqc_report.html") + all_outputs.append("results/all/vcf/" + REF_NAME + "/stats/vcf_merged_missing/multiqc/multiqc_report.html") # Functions used by rules of this part of the pipeline From 3891857217cb22bdc290a223eeb2081ffea6241c Mon Sep 17 00:00:00 2001 From: verku Date: Wed, 25 Oct 2023 11:28:11 +0200 Subject: [PATCH 45/49] Update function to collect the correct input files for multiqc so that extraction of historical or modern samples is avoided when running the pipeline for only historical or only modern samples --- workflow/rules/9_merge_vcfs.smk | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/workflow/rules/9_merge_vcfs.smk b/workflow/rules/9_merge_vcfs.smk index 577263a..8598d45 100644 --- a/workflow/rules/9_merge_vcfs.smk +++ b/workflow/rules/9_merge_vcfs.smk @@ -163,11 +163,11 @@ def missingness_filtered_vcf_multiqc_inputs(wildcards): fmiss=config["f_missing"], chr=CHR,) elif os.path.exists(config["historical_samples"]): - return expand("results/historical/vcf/" + REF_NAME + "/stats/vcf_merged_missing/" + REF_NAME + ".historical.merged.biallelic.fmissing{fmiss}.{chr}.vcf.stats.txt", + return expand("results/all/vcf/" + REF_NAME + "/stats/vcf_merged_missing/" + REF_NAME + ".all.merged.biallelic.fmissing{fmiss}.{chr}.vcf.stats.txt", fmiss=config["f_missing"], chr=CHR,) elif os.path.exists(config["modern_samples"]): - return expand("results/modern/vcf/" + REF_NAME + "/stats/vcf_merged_missing/" + REF_NAME + ".modern.merged.biallelic.fmissing{fmiss}.{chr}.vcf.stats.txt", + return expand("results/all/vcf/" + 
REF_NAME + "/stats/vcf_merged_missing/" + REF_NAME + ".all.merged.biallelic.fmissing{fmiss}.{chr}.vcf.stats.txt", fmiss=config["f_missing"], chr=CHR,) From 11df9ed439c6e3539e480e338f25a57851f994ee Mon Sep 17 00:00:00 2001 From: verku Date: Tue, 31 Oct 2023 15:40:19 +0100 Subject: [PATCH 46/49] Return to solution to extract modern or historical samples even when only run for one dataset --- workflow/rules/9_merge_vcfs.smk | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/workflow/rules/9_merge_vcfs.smk b/workflow/rules/9_merge_vcfs.smk index 8598d45..0514803 100644 --- a/workflow/rules/9_merge_vcfs.smk +++ b/workflow/rules/9_merge_vcfs.smk @@ -6,9 +6,9 @@ if os.path.exists(config["historical_samples"]) and os.path.exists(config["moder all_outputs.append(expand("results/{dataset}/vcf/" + REF_NAME + "/stats/vcf_merged_missing/multiqc/multiqc_report.html", dataset=["all", "historical", "modern"],)) elif os.path.exists(config["historical_samples"]): - all_outputs.append("results/all/vcf/" + REF_NAME + "/stats/vcf_merged_missing/multiqc/multiqc_report.html") + all_outputs.append("results/historical/vcf/" + REF_NAME + "/stats/vcf_merged_missing/multiqc/multiqc_report.html") elif os.path.exists(config["modern_samples"]): - all_outputs.append("results/all/vcf/" + REF_NAME + "/stats/vcf_merged_missing/multiqc/multiqc_report.html") + all_outputs.append("results/modern/vcf/" + REF_NAME + "/stats/vcf_merged_missing/multiqc/multiqc_report.html") # Functions used by rules of this part of the pipeline @@ -163,11 +163,11 @@ def missingness_filtered_vcf_multiqc_inputs(wildcards): fmiss=config["f_missing"], chr=CHR,) elif os.path.exists(config["historical_samples"]): - return expand("results/all/vcf/" + REF_NAME + "/stats/vcf_merged_missing/" + REF_NAME + ".all.merged.biallelic.fmissing{fmiss}.{chr}.vcf.stats.txt", + return expand("results/historical/vcf/" + REF_NAME + "/stats/vcf_merged_missing/" + REF_NAME + 
".historical.merged.biallelic.fmissing{fmiss}.{chr}.vcf.stats.txt", fmiss=config["f_missing"], chr=CHR,) elif os.path.exists(config["modern_samples"]): - return expand("results/all/vcf/" + REF_NAME + "/stats/vcf_merged_missing/" + REF_NAME + ".all.merged.biallelic.fmissing{fmiss}.{chr}.vcf.stats.txt", + return expand("results/modern/vcf/" + REF_NAME + "/stats/vcf_merged_missing/" + REF_NAME + ".modern.merged.biallelic.fmissing{fmiss}.{chr}.vcf.stats.txt", fmiss=config["f_missing"], chr=CHR,) @@ -575,7 +575,7 @@ rule extract_modern_samples: shell: """ samples_edited=`echo {params.samples} | sed 's/ /,/g'` - samples_len=`echo {params.samples} | wc -w` # count the number of historical samples + samples_len=`echo {params.samples} | wc -w` # count the number of modern samples all_samples_len=`echo {params.all_samples} | wc -w` # count the number of all samples if [ $samples_len != $all_samples_len ] From 329807f0153749eea5718671374befa905d9ef00 Mon Sep 17 00:00:00 2001 From: verku Date: Tue, 14 Nov 2023 10:05:01 +0100 Subject: [PATCH 47/49] Add Xmx option to use allocated memory more efficiently --- workflow/rules/3.1_bam_rmdup_realign_indels.smk | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/workflow/rules/3.1_bam_rmdup_realign_indels.smk b/workflow/rules/3.1_bam_rmdup_realign_indels.smk index a876557..cbeaee6 100644 --- a/workflow/rules/3.1_bam_rmdup_realign_indels.smk +++ b/workflow/rules/3.1_bam_rmdup_realign_indels.smk @@ -516,7 +516,8 @@ rule indel_realigner_targets: "docker://broadinstitute/gatk3:3.7-0" shell: """ - java -jar /usr/GenomeAnalysisTK.jar -T RealignerTargetCreator -R {input.ref} -I {input.bam} -o {output.target_list} -nt {threads} 2> {log} + mem=$(((6 * {threads}) - 2)) + java -jar -Xmx${{mem}}g /usr/GenomeAnalysisTK.jar -T RealignerTargetCreator -R {input.ref} -I {input.bam} -o {output.target_list} -nt {threads} 2> {log} """ @@ -538,7 +539,8 @@ rule indel_realigner: "docker://broadinstitute/gatk3:3.7-0" shell: """ - java 
-jar /usr/GenomeAnalysisTK.jar -T IndelRealigner -R {input.ref} -I {input.bam} -targetIntervals {input.target_list} -o {output.realigned} 2> {log} + mem=$(((6 * {threads}) - 2)) + java -jar -Xmx${{mem}}g /usr/GenomeAnalysisTK.jar -T IndelRealigner -R {input.ref} -I {input.bam} -targetIntervals {input.target_list} -o {output.realigned} 2> {log} """ From 95b6882094a15ca0208bff3038237e93008c5aae Mon Sep 17 00:00:00 2001 From: verku Date: Wed, 17 Jan 2024 12:54:13 +0100 Subject: [PATCH 48/49] Remove flag -a that creates *.fasta.align file in RepeatMasker since this file is not required --- workflow/rules/0.2_repeat_identification.smk | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/workflow/rules/0.2_repeat_identification.smk b/workflow/rules/0.2_repeat_identification.smk index 010cc00..01153ec 100644 --- a/workflow/rules/0.2_repeat_identification.smk +++ b/workflow/rules/0.2_repeat_identification.smk @@ -67,7 +67,6 @@ rule repeatmasker: repmo=rules.repeatmodeler.output.repmo, output: rep_masked=REF_DIR + "/repeatmasker/" + REF_NAME + "/" + REF_NAME + ".upper.fasta.masked", - rep_align=REF_DIR + "/repeatmasker/" + REF_NAME + "/" + REF_NAME + ".upper.fasta.align", rep_tbl=REF_DIR + "/repeatmasker/" + REF_NAME + "/" + REF_NAME + ".upper.fasta.tbl", rep_out=REF_DIR + "/repeatmasker/" + REF_NAME + "/" + REF_NAME + ".upper.fasta.out", rep_cat=REF_DIR + "/repeatmasker/" + REF_NAME + "/" + REF_NAME + ".upper.fasta.cat.gz", @@ -84,7 +83,7 @@ rule repeatmasker: shell: """ cd {params.dir} && - RepeatMasker -pa {threads} -a -xsmall -gccalc -dir ./ -lib {params.repmo} {params.ref_upper} 2> {log} && + RepeatMasker -pa {threads} -xsmall -gccalc -dir ./ -lib {params.repmo} {params.ref_upper} 2> {log} && # Check if *.cat file is compressed or uncompressed if [ ! 
-f {output.rep_cat} ] From 8c06ec9e0da993b27aac1f4f98788a117ff26095 Mon Sep 17 00:00:00 2001 From: verku Date: Wed, 17 Jan 2024 13:16:53 +0100 Subject: [PATCH 49/49] Remove Slurm settings for the repeatclassifier rule that has been removed from the workflow --- config/slurm/cluster.yaml | 3 --- config/slurm/profile/config.yaml | 3 --- 2 files changed, 6 deletions(-) diff --git a/config/slurm/cluster.yaml b/config/slurm/cluster.yaml index 509cb1e..4ef96f5 100644 --- a/config/slurm/cluster.yaml +++ b/config/slurm/cluster.yaml @@ -9,9 +9,6 @@ __default__: repeatmodeler: time: 06-00:00:00 cpus-per-task: 16 -repeatclassifier: - time: 01-00:00:00 - cpus-per-task: 2 repeatmasker: time: 10-00:00:00 cpus-per-task: 16 diff --git a/config/slurm/profile/config.yaml b/config/slurm/profile/config.yaml index 26eddf9..e1a09cd 100644 --- a/config/slurm/profile/config.yaml +++ b/config/slurm/profile/config.yaml @@ -21,7 +21,6 @@ default-resources: set-threads: ### repeat identification - repeatmodeler=16 - - repeatclassifier=2 - repeatmasker=16 ### fastq processing - historical_fastq_before_group=2 @@ -113,8 +112,6 @@ set-resources: ### repeat identification - repeatmodeler:runtime=8640 - repeatmodeler:mem_mb=102400 - - repeatclassifier:runtime=1440 - - repeatclassifier:mem_mb=12800 - repeatmasker:runtime=8640 - repeatmasker:mem_mb=102400 ### fastq processing