Skip to content

Commit

Permalink
Merge pull request #65 from vrmarcelino/dev
Browse files Browse the repository at this point in the history
Merge branch `dev` into `master` - `v1.5.0`
  • Loading branch information
vrmarcelino authored Apr 8, 2024
2 parents 956ef28 + 6123fbc commit ca5af58
Show file tree
Hide file tree
Showing 9 changed files with 185 additions and 183 deletions.
100 changes: 51 additions & 49 deletions CCMetagen.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,17 +11,14 @@
# imports
import sys
import pandas as pd
from argparse import ArgumentParser
import argparse
import subprocess
import os
from ete3 import NCBITaxa
import re

# local imports
from ccmetagen import __version__
from ccmetagen import fParseKMA
from ccmetagen import cTaxInfo # needed in fParseKMA
from ccmetagen import fNCBItax # needed in fParseKMA
from ccmetagen import __version__, _ARGPARSE_DEFAULTS, fParseKMA

# version formatting
version_numb = "v" + __version__
Expand Down Expand Up @@ -54,17 +51,21 @@
print("")
sys.exit()

parser = ArgumentParser()
parser = argparse.ArgumentParser(
formatter_class=argparse.ArgumentDefaultsHelpFormatter
)

parser.add_argument(
"-m",
"--mode",
default="both",
help="""what do you want CCMetagen to do?
Valid options are 'visual', 'text' or 'both':
text: parses kma, filters based on quality and output a text file with taxonomic information and detailed mapping information
visual: parses kma, filters based on quality and output a simplified text file and a krona html file for visualization
both: outputs both text and visual file formats. Default = both""",
default=_ARGPARSE_DEFAULTS.get("mode"),
help="""
What would you like CCMetagen to do?\n
Valid options are 'visual', 'text' or 'both':\n
\ttext: parses kma, filters based on quality and output a text file with taxonomic information and detailed mapping information.\n
\tvisual: parses kma, filters based on quality and output a simplified text file and a krona html file for visualization.\n
\tboth: outputs both text and visual file formats.
""",
required=False,
)

Expand All @@ -74,54 +75,55 @@
parser.add_argument(
"-o",
"--output_fp",
default="CCMetagen_out",
help="Path to the output file. Default = CCMetagen_out",
default=_ARGPARSE_DEFAULTS.get("output_fp"),
help="Path to the output file.",
required=False,
)
parser.add_argument(
"-r",
"--reference_database",
default="nt",
help="Which reference database was used. Options: UNITE, RefSeq or nt. Default = nt",
default=_ARGPARSE_DEFAULTS.get("reference_database"),
help="Which reference database was used. Options: UNITE, RefSeq or nt.",
required=False,
)
parser.add_argument(
"-ef",
"--extended_output_file",
default="n",
default=_ARGPARSE_DEFAULTS.get("extended_output_file"),
help="""Produce an extended output file that includes the percentage of classified reads.
Options: y or n. To use this featire, you need to generate the mapstat file when
required unning KMA (use flag -ef), and use it as input in CCMetagen (flag --mapstat).
Default = n""",
""",
required=False,
)

parser.add_argument(
"-du",
"--depth_unit",
default="kma",
default=_ARGPARSE_DEFAULTS.get("depth_unit"),
help="""Desired unit for Depth(abundance) measurements.
Default = kma (KMA default depth, which is the number of nucleotides overlapping each template,
divided by the lengh of the template).
Alternatively, you can have abundance calculated in Reads Per Million (RPM, option 'rpm'),
in number of nucleotides overlaping the template (option 'nc') or in number of fragments (i.e. PE reads, option 'fr').
If you use the 'nc', 'rpm' or 'fr' options, remember to change the default --depth parameter accordingly.
Valid options are nc, rpm, fr and kma""",
Valid options are 'nc', 'rpm', 'fr' and 'kma'.""",
required=False,
)
parser.add_argument(
"-map",
"--mapstat",
default=_ARGPARSE_DEFAULTS.get("mapstat"),
help="""Path to the mapstat file produced with KMA when using the -ef flag (.mapstat).
Required when calculating abundances in RPM or in number of fragments, visualing the abundances in readCounts or readContAln
for the krona graph or when producing the extended_output_file""",
for the krona graph or when producing the extended_output_file.""",
required=False,
)
parser.add_argument(
"-d",
"--depth",
default=0.2,
help="""Minimum sequencing depth. Default = 0.2. The unit corresponds to the one used with --depth_unit
default=_ARGPARSE_DEFAULTS.get("depth"),
help="""Minimum sequencing depth. The unit corresponds to the one used with --depth_unit
If you use --depth_unit different from the default, change this accordingly.
""",
type=float,
Expand All @@ -130,99 +132,99 @@
parser.add_argument(
"-c",
"--coverage",
default=20,
help="""Minimum coverage. Default = 20 (i.e. 20%% of the reference sequence)""",
default=_ARGPARSE_DEFAULTS.get("coverage"),
help="""Percetange of minimum coverage, e.g. 20 means 20%% of the reference sequence.""",
type=float,
required=False,
)
parser.add_argument(
"-q",
"--query_identity",
default=50,
help="Minimum query identity (Phylum level). Default = 50",
default=_ARGPARSE_DEFAULTS.get("query_identity"),
help="Minimum query identity (Phylum level).",
type=float,
required=False,
)
parser.add_argument(
"-p",
"--pvalue",
default=0.05,
help="Minimum p-value. Default = 0.05.",
default=_ARGPARSE_DEFAULTS.get("pvalue"),
help="Minimum p-value.",
type=float,
required=False,
)
parser.add_argument(
"-k",
"--krona_mode",
default="Depth",
help="""Abundance measure for the Krona graph. Default = Depth.
Alternatively, you can choose readCount ('rc') or readCountAln ('rca').
default=_ARGPARSE_DEFAULTS.get("krona_mode"),
help="""Abundance measure for the Krona graph.
You can choose depth ('Depth'), readCount ('rc') or readCountAln ('rca').
'rc' and 'rca' require a specified --mapstat file.
""",
required=False,
)
parser.add_argument(
"-tax",
"--local_taxfile",
default=None,
help="Use if a local taxdump file wants to be used. Default = None",
default=_ARGPARSE_DEFAULTS.get("local_taxfile"),
help="Use if a local taxdump file wants to be used.",
required=False,
)

# similarity thresholds:
parser.add_argument(
"-st",
"--species_threshold",
default=98.41,
help="Species-level similarity threshold. Default = 98.41",
default=_ARGPARSE_DEFAULTS.get("species_threshold"),
help="Species-level similarity threshold.",
type=float,
required=False,
)
parser.add_argument(
"-gt",
"--genus_threshold",
default=96.31,
help="Genus-level similarity threshold. Default = 96.31",
default=_ARGPARSE_DEFAULTS.get("genus_threshold"),
help="Genus-level similarity threshold.",
type=float,
required=False,
)
parser.add_argument(
"-ft",
"--family_threshold",
default=88.51,
help="Family-level similarity threshold. Default = 88.51",
default=_ARGPARSE_DEFAULTS.get("family_threshold"),
help="Family-level similarity threshold.",
type=float,
required=False,
)
parser.add_argument(
"-ot",
"--order_threshold",
default=81.21,
help="Order-level similarity threshold. Default = 81.21",
default=_ARGPARSE_DEFAULTS.get("order_threshold"),
help="Order-level similarity threshold.",
type=float,
required=False,
)
parser.add_argument(
"-ct",
"--class_threshold",
default=80.91,
help="Class-level similarity threshold. Default = 80.91",
default=_ARGPARSE_DEFAULTS.get("class_threshold"),
help="Class-level similarity threshold.",
type=float,
required=False,
)
parser.add_argument(
"-pt",
"--phylum_threshold",
default=0,
help="Phylum-level similarity threshold. Default = 0 - not applied",
default=_ARGPARSE_DEFAULTS.get("phylum_threshold"),
help="Phylum-level similarity threshold. 0 means not applied.",
type=float,
required=False,
)
parser.add_argument(
"-off",
"--turn_off_sim_thresholds",
default="n",
help="Turns simularity-based filtering off. Options = y or n. Default = n",
default=_ARGPARSE_DEFAULTS.get("turn_off_sim_thresholds"),
help="Turns simularity-based filtering off. Options = 'y' or 'n'.",
required=False,
)

Expand Down Expand Up @@ -387,7 +389,7 @@
##### Quality control + taxonomic assignments

# quality filter (coverage, query identity, Depth and p-value)
df = fParseKMA.res_filter(df, ref_database, c, q, d, p)
df = fParseKMA.res_filter(df, c, q, d, p)

# add tax info
df = fParseKMA.populate_w_tax(df, ref_database, st, gt, ft, ot, ct, pt)
Expand Down
30 changes: 29 additions & 1 deletion ccmetagen/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1,29 @@
__version__ = "1.4.3"
__version__ = "1.5.0"

# This can be configured
_RANK_THRESHOLDS = {
"species_threshold": 98.41, # Yeast - Vu et al 2016
"genus_threshold": 96.31, # Yeast - Vu et al 2016
"family_threshold": 88.51, # Filamentous fungi - Vu et al 2019
"order_threshold": 81.21, # Filamentous fungi - Vu et al 2019
"class_threshold": 80.91, # Filamentous fungi - Vu et al 2019
"phylum_threshold": 0, # no data, no filtering
}
_ARGPARSE_DEFAULTS = {
"mode": "both",
"output_fp": "CCMetagen_out",
"reference_database": "nt",
"extended_output_file": "n",
"depth_unit": "kma",
"mapstat": None,
"depth": 0.2,
"coverage": 20,
"query_identity": 50,
"pvalue": 0.05,
"krona_mode": "Depth",
"local_taxfile": None,
"turn_off_sim_thresholds": "n",
**_RANK_THRESHOLDS,
}

__all__ = ["__version__", "_ARGPARSE_DEFAULTS", "_RANK_THRESHOLDS"]
4 changes: 2 additions & 2 deletions ccmetagen/cTaxInfo.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,8 +55,8 @@ def __init__(
self.Class_TaxId = Class_TaxId
self.Order = Order
self.Order_TaxId = Order_TaxId
self.Family = Order
self.Family_TaxId = Order_TaxId
self.Family = Family
self.Family_TaxId = Family_TaxId
self.Genus = Genus
self.Genus_TaxId = Genus_TaxId
self.Species = Species
Expand Down
79 changes: 16 additions & 63 deletions ccmetagen/fNCBItax.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,14 +16,14 @@

def lineage_extractor(query_taxid, TaxInfo_object, taxfile=None):
list_of_taxa_ranks = [
"superkingdom",
"kingdom",
"phylum",
"class",
"order",
"family",
"genus",
"species",
"Superkingdom",
"Kingdom",
"Phylum",
"Class",
"Order",
"Family",
"Genus",
"Species",
]
if taxfile is not None:
ncbi = NCBITaxa(taxfile)
Expand All @@ -34,63 +34,16 @@ def lineage_extractor(query_taxid, TaxInfo_object, taxfile=None):
names = ncbi.get_taxid_translator(lineage)

# get known data

for key, val in ranks.items():
if val == list_of_taxa_ranks[0]:
TaxInfo_object.Superkingdom = names[key]
TaxInfo_object.Superkingdom_TaxId = key

elif val == list_of_taxa_ranks[1]:
TaxInfo_object.Kingdom = names[key]
TaxInfo_object.Kingdom_TaxId = key

elif val == list_of_taxa_ranks[2]:
TaxInfo_object.Phylum = names[key]
TaxInfo_object.Phylum_TaxId = key

elif val == list_of_taxa_ranks[3]:
TaxInfo_object.Class = names[key]
TaxInfo_object.Class_TaxId = key

elif val == list_of_taxa_ranks[4]:
TaxInfo_object.Order = names[key]
TaxInfo_object.Order_TaxId = key

elif val == list_of_taxa_ranks[5]:
TaxInfo_object.Family = names[key]
TaxInfo_object.Family_TaxId = key

elif val == list_of_taxa_ranks[6]:
TaxInfo_object.Genus = names[key]
TaxInfo_object.Genus_TaxId = key

elif val == list_of_taxa_ranks[7]:
TaxInfo_object.Species = names[key]
TaxInfo_object.Species_TaxId = key
for rank in list_of_taxa_ranks:
if val.lower() == rank.lower():
setattr(TaxInfo_object, rank, names[key])
setattr(TaxInfo_object, rank + "_TaxId", key)

# fill in the blanks
if TaxInfo_object.Superkingdom is None:
TaxInfo_object.Superkingdom = "unk_sk"

if TaxInfo_object.Kingdom is None:
TaxInfo_object.Kingdom = "unk_k"

if TaxInfo_object.Phylum is None:
TaxInfo_object.Phylum = "unk_p"

if TaxInfo_object.Class is None:
TaxInfo_object.Class = "unk_c"

if TaxInfo_object.Order is None:
TaxInfo_object.Order = "unk_o"

if TaxInfo_object.Family is None:
TaxInfo_object.Family = "unk_f"

if TaxInfo_object.Genus is None:
TaxInfo_object.Genus = "unk_g"

if TaxInfo_object.Species is None:
TaxInfo_object.Species = "unk_s"
for attr in list_of_taxa_ranks:
if getattr(TaxInfo_object, attr) is None:
initial = attr.lower()[0] if attr != "Superkingdom" else "sk"
setattr(TaxInfo_object, attr, "unk_" + initial)

return TaxInfo_object
Loading

0 comments on commit ca5af58

Please sign in to comment.