Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Sourmash wrapper #6618

Draft
wants to merge 2 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions tools/sourmash/.shed.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Tool Shed metadata for the sourmash Galaxy wrapper.
name: sourmash
owner: iuc
description: "Compute and compare MinHash signatures for DNA data sets."
long_description: |
  Quickly search, compare, and analyze genomic and metagenomic data sets
# homepage_url is the upstream project; remote_repository_url is where the
# wrapper itself is developed.
homepage_url: https://sourmash.readthedocs.io/en/latest/
remote_repository_url: https://github.com/galaxyproject/tools-iuc/tree/main/tools/sourmash
type: unrestricted
categories:
  - Metagenomics
auto_tool_repositories:
  name_template: "{{ tool_id }}"
  description_template: "Wrapper to sketch DNA, LCA classify and summarize : {{ tool_name }}"




106 changes: 106 additions & 0 deletions tools/sourmash/dna.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
"""create DNA signatures"""

# Usage/help text for the 'sketch dna' subcommand. It is handed to
# add_parser(usage=...) in subparser(), so argparse prints it verbatim in
# place of an auto-generated usage line.
usage = """

sourmash sketch dna data/*.fna.gz

The 'sketch dna' command reads in DNA sequences and outputs DNA
sketches.

By default, 'sketch dna' uses the parameter string 'k=31,scaled=1000,noabund'.

This creates sketches with a k-mer size of 31, a scaled factor of
1000, and no abundance tracking of k-mers. You can specify one or
more parameter strings of your own with -p, e.g. 'sourmash sketch dna
-p k=31,noabund -p k=21,scaled=100,abund'. Note that a single `-p` parameter string can contain multiple ksize values, but only a single scaled value or abundance value, e.g. -p k=21,k=31,abund

'sourmash sketch' takes input sequences in FASTA and FASTQ,
uncompressed or gz/bz2 compressed.

Please see the 'sketch' documentation for more details:
https://sourmash.readthedocs.io/en/latest/sourmash-sketch.html
"""

import sourmash
from sourmash.logging import notify, print_results, error

from sourmash import command_sketch

# Import-time guard: the usage text hard-codes the default parameter string,
# so fail loudly if the library's DNA default ever differs from it.
# NOTE(review): assert statements are stripped under `python -O`.
assert command_sketch.DEFAULTS["dna"] == "k=31,scaled=1000,noabund"


def subparser(subparsers):
    """Register the ``dna`` sketch subcommand and all of its options.

    Args:
        subparsers: the object returned by
            ``ArgumentParser.add_subparsers()``. A parser named ``dna``
            (aliases ``rna``, ``nucleotide`` and ``nt``) is added to it.

    Returns:
        None. The new parser is attached to ``subparsers`` as a side effect.
    """
    parser = subparsers.add_parser(
        "dna", aliases=["rna", "nucleotide", "nt"], usage=usage
    )

    # General sketching options.
    parser.add_argument(
        "--license",
        default="CC0",
        type=str,
        help="signature license. Currently only CC0 is supported.",
    )
    parser.add_argument(
        "--check-sequence",
        action="store_true",
        help="complain if input sequence is invalid DNA",
    )
    # Repeatable: every -p appends one parameter string to the list.
    parser.add_argument(
        "-p",
        "--param-string",
        default=[],
        help="signature parameters to use.",
        action="append",
    )
    parser.add_argument("filenames", nargs="*", help="file(s) of sequences")

    # Everything about where input comes from and where output goes lives in
    # one help group, including --from-file.
    file_args = parser.add_argument_group("File handling options")
    file_args.add_argument(
        "-f",
        "--force",
        action="store_true",
        help="recompute signatures even if the file exists",
    )
    file_args.add_argument(
        "--from-file",
        help="a text file containing a list of sequence files to load",
    )
    file_args.add_argument(
        "-o", "--output", help="output computed signatures to this file"
    )
    # All three spellings store into ``args.merge``; the value is a sketch
    # name, not a path, so the help placeholder is NAME.
    file_args.add_argument(
        "--set-name",
        "--name",
        "--merge",
        dest="merge",
        type=str,
        default="",
        metavar="NAME",
        help="name the output sketch as specified; note, merges all input "
        "files while sketching",
    )
    file_args.add_argument(
        "--output-dir",
        "--outdir",
        help="output computed signatures to this directory",
    )
    file_args.add_argument(
        "--singleton",
        action="store_true",
        help="compute a signature for each sequence record individually",
    )
    file_args.add_argument(
        "--name-from-first",
        action="store_true",
        help="name the signature generated from each file after the first "
        "record in the file",
    )
    file_args.add_argument(
        "--randomize",
        action="store_true",
        help="shuffle the list of input filenames randomly",
    )


def main(args):
    """Run the DNA sketch command for the parsed ``args``.

    The sketch implementation is imported lazily, at call time, and whatever
    its ``dna`` function returns is passed straight back to the caller.
    """
    from sourmash import command_sketch as sketch_impl

    result = sketch_impl.dna(args)
    return result
23 changes: 23 additions & 0 deletions tools/sourmash/macros.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
<?xml version="1.0"?>
<!-- Shared tokens and macros for the sourmash Galaxy wrapper. -->
<macros>
    <!-- Upstream sourmash release wrapped by this tool. -->
    <token name="@TOOL_VERSION@">4.8.11</token>
    <!-- Wrapper revision; bump when the wrapper changes without a sourmash update. -->
    <token name="@VERSION_SUFFIX@">0</token>
    <token name="@PROFILE@">23.2</token>

    <xml name="xrefs">
        <xrefs>
            <xref type="bio.tools">sourmash</xref>
        </xrefs>
    </xml>
    <xml name="requirements">
        <requirements>
            <!-- sourmash is a Python package distributed as "sourmash"; it is not a Bioconductor package. -->
            <requirement type="package" version="@TOOL_VERSION@">sourmash</requirement>
        </requirements>
    </xml>
    <xml name="citations">
        <citations>
            <citation type="doi">10.21105/joss.00027</citation>
        </citations>
    </xml>
</macros>
64 changes: 64 additions & 0 deletions tools/sourmash/sourmash.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
<tool id="sourmash" name="sourmash" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@">
    <description>Compute and compare MinHash signatures for DNA data sets</description>
    <macros>
        <import>macros.xml</import>
    </macros>
    <expand macro="xrefs"/>
    <expand macro="requirements"/>

    <command detect_errors="exit_code"><![CDATA[
        ## Build one parameter string in the form sourmash expects,
        ## e.g. 'k=21,k=31,scaled=1000,noabund'.
        #set $ksizes = ','.join(['k=' + $k for $k in str($additional_options.k_mers).split(',')])
        #set $param_string = ','.join([$ksizes, 'scaled=' + str($additional_options.scaled), str($additional_options.noabund)])

        ## Call the sourmash CLI directly. The sequence dataset is a positional
        ## argument (--from-file expects a text file listing sequence files),
        ## and the boolean flags take no value: each parameter expands to the
        ## bare flag or to nothing.
        sourmash sketch dna
            --param-string '$param_string'
            $additional_options.singleton
            $additional_options.name_from_first
            $additional_options.randomize
            --output '$dna_sketch'
            '$input_seq'
    ]]></command>
    <inputs>
        <param name="input_seq" type="data" format="fasta,fasta.gz,fastqsanger,fastqsanger.gz" label="Input DNA Sequence"/>
        <section name="additional_options" title="Additional Options" expanded="true">
            <!-- Text, not integer: integer params cannot hold several values. The validator keeps the value safe to embed in the command line. -->
            <param name="k_mers" type="text" value="31" label="K-mer size" help="One or more k-mer sizes separated by ',' (e.g. 21,31,51). DEFAULT: 31">
                <validator type="regex" message="Enter one or more positive integers separated by commas, without spaces">^[0-9]+(,[0-9]+)*$</validator>
            </param>
            <param name="scaled" type="integer" value="1000" min="1" label="Scaled factor" help="DEFAULT: 1000"/>
            <param name="noabund" type="select" label="Abundance tracking of k-mers" help="DEFAULT: noabund">
                <option value="noabund" selected="true">No abundance tracking of k-mer</option>
                <option value="abund">Abundance tracking of k-mer</option>
            </param>
            <!-- The three flags default to off, matching the sourmash command-line defaults. -->
            <param argument="--singleton" type="boolean" truevalue="--singleton" falsevalue="" checked="false" label="Compute a signature for each sequence record individually"/>
            <param argument="--name-from-first" type="boolean" truevalue="--name-from-first" falsevalue="" checked="false" label="Name the signature generated from each file after the first record in the file"/>
            <param argument="--randomize" type="boolean" truevalue="--randomize" falsevalue="" checked="false" label="Shuffle the list of input filenames randomly"/>
        </section>
    </inputs>
    <outputs>
        <!-- sourmash writes signatures as JSON to the path given with -o/&#45;&#45;output. -->
        <data name="dna_sketch" format="json" label="${tool.name} on ${on_string}: DNA sketch"/>
    </outputs>
    <tests>
        <test expect_num_outputs="1">
            <param name="input_seq" value="GCA_903797575.1_PARATYPHIC668_genomic (1).fna"/>
            <section name="additional_options">
                <param name="k_mers" value="45"/>
                <param name="scaled" value="1040"/>
                <param name="noabund" value="noabund"/>
                <param name="singleton" value="true"/>
                <param name="name_from_first" value="true"/>
                <param name="randomize" value="true"/>
            </section>
            <output name="dna_sketch">
                <assert_contents>
                    <has_text text="sourmash_signature"/>
                </assert_contents>
            </output>
        </test>
    </tests>

    <help><![CDATA[
Quickly search, compare, and analyze genomic and metagenomic data sets

**What this tool does**

Runs ``sourmash sketch dna`` on a FASTA or FASTQ dataset and writes the
resulting DNA signatures (sketches) to a JSON file.

**Parameters**

- *K-mer size*: one or more k-mer sizes, comma separated (default 31).
- *Scaled factor*: the sourmash ``scaled`` value (default 1000).
- *Abundance tracking*: whether k-mer abundances are stored (default off).

See https://sourmash.readthedocs.io/en/latest/sourmash-sketch.html for details.
    ]]></help>
    <expand macro="citations"/>
</tool>
Loading
Loading