From b6cec1a17cedbc0411d33454e2147ce9f3d9a96c Mon Sep 17 00:00:00 2001 From: Art Rand Date: Sat, 21 Dec 2024 04:40:24 +0000 Subject: [PATCH] Prepare 0.4.2 release --- CHANGELOG.md | 8 + book/src/advanced_usage.md | 104 +++++++- docs/404.html | 2 +- docs/advanced_usage.html | 108 +++++++- docs/algo_details.html | 2 +- docs/collapse.html | 2 +- docs/dmr_scoring_details.html | 2 +- docs/evaluate_motif.html | 2 +- docs/faq.html | 2 +- docs/filtering.html | 2 +- docs/filtering_details.html | 2 +- docs/filtering_numeric_details.html | 2 +- docs/index.html | 8 +- docs/intro_adjust.html | 6 +- docs/intro_bedmethyl_merge.html | 251 ++++++++++++++++++ docs/intro_call_mods.html | 2 +- docs/intro_dmr.html | 37 +-- docs/intro_edge_filter.html | 2 +- docs/intro_entropy.html | 6 +- docs/intro_extract.html | 2 +- docs/intro_find_motifs.html | 4 +- docs/intro_include_bed.html | 6 +- docs/intro_localize.html | 2 +- docs/intro_motif.html | 2 +- docs/intro_motif_bed.html | 2 +- ...intro_bedmethyl.html => intro_pileup.html} | 2 +- docs/intro_pileup_hemi.html | 6 +- docs/intro_repair.html | 2 +- docs/intro_sample_probs.html | 2 +- docs/intro_stats.html | 2 +- docs/intro_summary.html | 2 +- docs/intro_validate.html | 2 +- docs/limitations.html | 2 +- docs/perf_considerations.html | 2 +- docs/print.html | 178 +++++++++++-- docs/quick_start.html | 8 +- docs/searchindex.js | 2 +- docs/searchindex.json | 2 +- docs/troubleshooting.html | 4 +- generate_advanced_usage.sh | 8 + 40 files changed, 689 insertions(+), 103 deletions(-) create mode 100644 docs/intro_bedmethyl_merge.html rename docs/{intro_bedmethyl.html => intro_pileup.html} (83%) diff --git a/CHANGELOG.md b/CHANGELOG.md index 270da79..0a51481 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,14 @@ All notable changes to this project will be documented in this file. 
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [v0.4.2] +### Adds +- [entropy] Entropy can now be calculated with multiple motifs and multiple modified primary bases. +- [adjust-mods, call-mods] Retain or remove base modification calls based on whether they match a sequence motif in the basecall sequence. +- [bedmethyl] Add command to merge bedMethyl files. +- [dmr] Add strand to DMR output. + + ## [v0.4.1] ### Adds - [docs] Fix documentation links diff --git a/book/src/advanced_usage.md b/book/src/advanced_usage.md index 1fd1d63..eda911d 100644 --- a/book/src/advanced_usage.md +++ b/book/src/advanced_usage.md @@ -64,6 +64,7 @@ Commands: localize Investigate patterns of base modifications, by aggregating pileup counts "localized" around genomic features of interest stats Calculate base modification levels over entire regions + bedmethyl Utilities to work with bedMethyl files help Print this message or the help of the given subcommand(s) Options: @@ -337,9 +338,6 @@ Arguments: one of `-` or `stdin` to specify a stream from standard output Options: - --log-filepath - Output debug logs to file at this path - --ignore Modified base code to ignore/remove, see https://samtools.github.io/hts-specs/SAMtags.pdf for details on the @@ -438,11 +436,31 @@ Options: when estimating the filter threshold (i.e. ignore soft-clipped, and inserted bases) + --motif + Filter out any base modification call that isn't part of a basecall + sequence motif. This argument can be passed multiple times. Format is + <sequence> <offset>. For example the argument to match CpG + dinucleotides is `--motif CG 0`, or to match CG[5mC]G the argument + would be `--motif CGCG 2`. 
Single bases can be used as motifs to keep + only base modification calls for a specific primary base, for example + `--motif C 0` + + --cpg + Shorthand for --motif CG 0 + + --discard-motifs + Discard base modification calls that match the provided motifs + (instead of keeping them) + --suppress-progress Hide the progress bar -h, --help Print help (see a summary with '-h') + +Logging: + --log-filepath + Output debug logs to file at this path ``` ## update-tags @@ -851,6 +869,20 @@ Options: using this flag will keep only base modification calls in the first 4 and last 8 bases + --motif + Filter out any base modification call that isn't part of a basecall + sequence motif. This argument can be passed multiple times. Format is + <sequence> <offset>. For example the argument to match CpG + dinucleotides is `--motif CG 0`, or to match CG[5mC]G the argument + would be `--motif CGCG 2` + + --cpg + Shorthand for --motif CG 0 + + --discard-motifs + Discard base modification calls that match the provided motifs + (instead of keeping them) + --output-sam Output SAM format instead of BAM @@ -1263,7 +1295,10 @@ Options: Respect soft masking in the reference FASTA --motif - Motif to use for entropy calculation, default will be CpG + Motif to use for entropy calculation, multiple motifs can be used by + repeating this option. When multiple motifs are used that specify + different modified primary bases, all modification possibilities will + be used in the calculation --cpg Use CpG motifs. Short hand for --motif CG 0 --combine-strands @@ -2372,3 +2407,64 @@ Options: -h, --help Print help ``` + +## bedmethyl merge +```text +Perform an outer join on two or more bedMethyl files, summing their counts for +records that overlap + +Usage: modkit bedmethyl merge [OPTIONS] --out-bed --genome-sizes [IN_BEDMETHYL] [IN_BEDMETHYL]... + +Arguments: + [IN_BEDMETHYL] [IN_BEDMETHYL]... + Input bedMethyl table(s). Should be bgzip-compressed and have an + associated Tabix index. 
The tabix index will be assumed to be + $this_file.tbi + +Options: + -o, --out-bed + Specify the output file to write the results table + + -g, --genome-sizes + TSV of genome sizes, should be <chrom>\t<size_in_bp> + + --force + Force overwrite the output file + + --with-header + Output a header with the bedMethyl + + --mixed-delim + Output bedMethyl where the delimiter of columns past column 10 are + space-delimited instead of tab-delimited. This option can be useful + for some browsers and parsers that don't expect the extra columns of + the bedMethyl format + + --chunk-size + Chunk size for how many start..end regions for each chromosome to + read. Larger values will lead to faster merging at the expense of + memory usage, while smaller values will be slower with lower memory + usage. This option will only impact large bedmethyl files + + -i, --interval-size + Interval chunk size in base pairs to process concurrently. Smaller + interval chunk sizes will use less memory but incur more overhead + + [default: 100000] + + --log-filepath + Specify a file to write debug logs to + + -t, --threads + Number of threads to use + + [default: 4] + + --io-threads + Number of tabix/bgzf threads to use + + [default: 2] + + -h, --help + Print help (see a summary with '-h') +``` diff --git a/docs/404.html b/docs/404.html index 153907a..6fdb929 100644 --- a/docs/404.html +++ b/docs/404.html @@ -92,7 +92,7 @@