Merge pull request #65 from vrmarcelino/dev

Merge branch `dev` into `master` - `v1.5.0`
vrmarcelino · Apr 8, 2024 · ca5af58 · ca5af58
2 parents 956ef28 + 6123fbc
commit ca5af58
Show file tree

Hide file tree

Showing 9 changed files with 185 additions and 183 deletions.
diff --git a/CCMetagen.py b/CCMetagen.py
@@ -11,17 +11,14 @@
 # imports
 import sys
 import pandas as pd
-from argparse import ArgumentParser
+import argparse
 import subprocess
 import os
 from ete3 import NCBITaxa
 import re
 
 # local imports
-from ccmetagen import __version__
-from ccmetagen import fParseKMA
-from ccmetagen import cTaxInfo  # needed in fParseKMA
-from ccmetagen import fNCBItax  # needed in fParseKMA
+from ccmetagen import __version__, _ARGPARSE_DEFAULTS, fParseKMA
 
 # version formatting
 version_numb = "v" + __version__
@@ -54,17 +51,21 @@
         print("")
         sys.exit()
 
-    parser = ArgumentParser()
+    parser = argparse.ArgumentParser(
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter
+    )
 
     parser.add_argument(
         "-m",
         "--mode",
-        default="both",
-        help="""what do you want CCMetagen to do?
-                        Valid options are 'visual', 'text' or 'both':
-                            text: parses kma, filters based on quality and output a text file with taxonomic information and detailed mapping information
-                            visual: parses kma, filters based on quality and output a simplified text file and a krona html file for visualization
-                            both: outputs both text and visual file formats. Default = both""",
+        default=_ARGPARSE_DEFAULTS.get("mode"),
+        help="""
+        What would you like CCMetagen to do?\n
+        Valid options are 'visual', 'text' or 'both':\n
+            \ttext: parses kma, filters based on quality and output a text file with taxonomic information and detailed mapping information.\n
+            \tvisual: parses kma, filters based on quality and output a simplified text file and a krona html file for visualization.\n
+            \tboth: outputs both text and visual file formats.
+            """,
         required=False,
     )
 
@@ -74,54 +75,55 @@
     parser.add_argument(
         "-o",
         "--output_fp",
-        default="CCMetagen_out",
-        help="Path to the output file. Default = CCMetagen_out",
+        default=_ARGPARSE_DEFAULTS.get("output_fp"),
+        help="Path to the output file.",
         required=False,
     )
     parser.add_argument(
         "-r",
         "--reference_database",
-        default="nt",
-        help="Which reference database was used. Options: UNITE, RefSeq or nt. Default = nt",
+        default=_ARGPARSE_DEFAULTS.get("reference_database"),
+        help="Which reference database was used. Options: UNITE, RefSeq or nt.",
         required=False,
     )
     parser.add_argument(
         "-ef",
         "--extended_output_file",
-        default="n",
+        default=_ARGPARSE_DEFAULTS.get("extended_output_file"),
         help="""Produce an extended output file that includes the percentage of classified reads.
                         Options: y or n. To use this featire, you need to generate the mapstat file when
                         required unning KMA (use flag -ef), and use it as input in CCMetagen (flag --mapstat). 
-                        Default = n""",
+                        """,
         required=False,
     )
 
     parser.add_argument(
         "-du",
         "--depth_unit",
-        default="kma",
+        default=_ARGPARSE_DEFAULTS.get("depth_unit"),
         help="""Desired unit for Depth(abundance) measurements.
                         Default = kma (KMA default depth, which is the number of nucleotides overlapping each template,
                         divided by the lengh of the template).
                         Alternatively, you can have abundance calculated in Reads Per Million (RPM, option 'rpm'), 
                         in number of nucleotides overlaping the template (option 'nc') or in number of fragments (i.e. PE reads, option 'fr').
                         If you use the 'nc', 'rpm' or 'fr' options, remember to change the default --depth parameter accordingly.
-                        Valid options are nc, rpm, fr and kma""",
+                        Valid options are 'nc', 'rpm', 'fr' and 'kma'.""",
         required=False,
     )
     parser.add_argument(
         "-map",
         "--mapstat",
+        default=_ARGPARSE_DEFAULTS.get("mapstat"),
         help="""Path to the mapstat file produced with KMA when using the -ef flag (.mapstat).
                         Required when calculating abundances in RPM or in number of fragments, visualing the abundances in readCounts or readContAln
-                        for the krona graph or when producing the extended_output_file""",
+                        for the krona graph or when producing the extended_output_file.""",
         required=False,
     )
     parser.add_argument(
         "-d",
         "--depth",
-        default=0.2,
-        help="""Minimum sequencing depth. Default = 0.2. The unit corresponds to the one used with --depth_unit
+        default=_ARGPARSE_DEFAULTS.get("depth"),
+        help="""Minimum sequencing depth. The unit corresponds to the one used with --depth_unit
                         If you use --depth_unit different from the default, change this accordingly.
                         """,
         type=float,
@@ -130,99 +132,99 @@
     parser.add_argument(
         "-c",
         "--coverage",
-        default=20,
-        help="""Minimum coverage. Default = 20 (i.e. 20%% of the reference sequence)""",
+        default=_ARGPARSE_DEFAULTS.get("coverage"),
+        help="""Percetange of minimum coverage, e.g. 20 means 20%% of the reference sequence.""",
         type=float,
         required=False,
     )
     parser.add_argument(
         "-q",
         "--query_identity",
-        default=50,
-        help="Minimum query identity (Phylum level). Default = 50",
+        default=_ARGPARSE_DEFAULTS.get("query_identity"),
+        help="Minimum query identity (Phylum level).",
         type=float,
         required=False,
     )
     parser.add_argument(
         "-p",
         "--pvalue",
-        default=0.05,
-        help="Minimum p-value. Default = 0.05.",
+        default=_ARGPARSE_DEFAULTS.get("pvalue"),
+        help="Minimum p-value.",
         type=float,
         required=False,
     )
     parser.add_argument(
         "-k",
         "--krona_mode",
-        default="Depth",
-        help="""Abundance measure for the Krona graph. Default = Depth. 
-                        Alternatively, you can choose readCount ('rc') or readCountAln ('rca').
+        default=_ARGPARSE_DEFAULTS.get("krona_mode"),
+        help="""Abundance measure for the Krona graph. 
+                        You can choose depth ('Depth'), readCount ('rc') or readCountAln ('rca').
                         'rc' and 'rca' require a specified --mapstat file.
                         """,
         required=False,
     )
     parser.add_argument(
         "-tax",
         "--local_taxfile",
-        default=None,
-        help="Use if a local taxdump file wants to be used. Default = None",
+        default=_ARGPARSE_DEFAULTS.get("local_taxfile"),
+        help="Use if a local taxdump file wants to be used.",
         required=False,
     )
 
     # similarity thresholds:
     parser.add_argument(
         "-st",
         "--species_threshold",
-        default=98.41,
-        help="Species-level similarity threshold. Default = 98.41",
+        default=_ARGPARSE_DEFAULTS.get("species_threshold"),
+        help="Species-level similarity threshold.",
         type=float,
         required=False,
     )
     parser.add_argument(
         "-gt",
         "--genus_threshold",
-        default=96.31,
-        help="Genus-level similarity threshold. Default = 96.31",
+        default=_ARGPARSE_DEFAULTS.get("genus_threshold"),
+        help="Genus-level similarity threshold.",
         type=float,
         required=False,
     )
     parser.add_argument(
         "-ft",
         "--family_threshold",
-        default=88.51,
-        help="Family-level similarity threshold. Default = 88.51",
+        default=_ARGPARSE_DEFAULTS.get("family_threshold"),
+        help="Family-level similarity threshold.",
         type=float,
         required=False,
     )
     parser.add_argument(
         "-ot",
         "--order_threshold",
-        default=81.21,
-        help="Order-level similarity threshold. Default = 81.21",
+        default=_ARGPARSE_DEFAULTS.get("order_threshold"),
+        help="Order-level similarity threshold.",
         type=float,
         required=False,
     )
     parser.add_argument(
         "-ct",
         "--class_threshold",
-        default=80.91,
-        help="Class-level similarity threshold. Default = 80.91",
+        default=_ARGPARSE_DEFAULTS.get("class_threshold"),
+        help="Class-level similarity threshold.",
         type=float,
         required=False,
     )
     parser.add_argument(
         "-pt",
         "--phylum_threshold",
-        default=0,
-        help="Phylum-level similarity threshold. Default = 0 - not applied",
+        default=_ARGPARSE_DEFAULTS.get("phylum_threshold"),
+        help="Phylum-level similarity threshold. 0 means not applied.",
         type=float,
         required=False,
     )
     parser.add_argument(
         "-off",
         "--turn_off_sim_thresholds",
-        default="n",
-        help="Turns simularity-based filtering off. Options = y or n. Default = n",
+        default=_ARGPARSE_DEFAULTS.get("turn_off_sim_thresholds"),
+        help="Turns simularity-based filtering off. Options = 'y' or 'n'.",
         required=False,
     )
 
@@ -387,7 +389,7 @@
     ##### Quality control + taxonomic assignments
 
     # quality filter (coverage, query identity, Depth and p-value)
-    df = fParseKMA.res_filter(df, ref_database, c, q, d, p)
+    df = fParseKMA.res_filter(df, c, q, d, p)
 
     # add tax info
     df = fParseKMA.populate_w_tax(df, ref_database, st, gt, ft, ot, ct, pt)

diff --git a/ccmetagen/__init__.py b/ccmetagen/__init__.py
@@ -1 +1,29 @@
-__version__ = "1.4.3"
+__version__ = "1.5.0"
+
+# This can be configured
+_RANK_THRESHOLDS = {
+    "species_threshold": 98.41, # Yeast - Vu et al 2016
+    "genus_threshold": 96.31,   # Yeast - Vu et al 2016
+    "family_threshold": 88.51,  # Filamentous fungi - Vu et al 2019
+    "order_threshold": 81.21,   # Filamentous fungi - Vu et al 2019
+    "class_threshold": 80.91,   # Filamentous fungi - Vu et al 2019
+    "phylum_threshold": 0,      # no data, no filtering    
+}
+_ARGPARSE_DEFAULTS = {
+    "mode": "both",
+    "output_fp": "CCMetagen_out",
+    "reference_database": "nt",
+    "extended_output_file": "n",
+    "depth_unit": "kma",
+    "mapstat": None,
+    "depth": 0.2,
+    "coverage": 20,
+    "query_identity": 50,
+    "pvalue": 0.05,
+    "krona_mode": "Depth",
+    "local_taxfile": None,
+    "turn_off_sim_thresholds": "n",
+    **_RANK_THRESHOLDS,
+}
+
+__all__ = ["__version__", "_ARGPARSE_DEFAULTS", "_RANK_THRESHOLDS"]
diff --git a/ccmetagen/cTaxInfo.py b/ccmetagen/cTaxInfo.py
@@ -55,8 +55,8 @@ def __init__(
         self.Class_TaxId = Class_TaxId
         self.Order = Order
         self.Order_TaxId = Order_TaxId
-        self.Family = Order
-        self.Family_TaxId = Order_TaxId
+        self.Family = Family
+        self.Family_TaxId = Family_TaxId
         self.Genus = Genus
         self.Genus_TaxId = Genus_TaxId
         self.Species = Species

diff --git a/ccmetagen/fNCBItax.py b/ccmetagen/fNCBItax.py
@@ -16,14 +16,14 @@
 
 def lineage_extractor(query_taxid, TaxInfo_object, taxfile=None):
     list_of_taxa_ranks = [
-        "superkingdom",
-        "kingdom",
-        "phylum",
-        "class",
-        "order",
-        "family",
-        "genus",
-        "species",
+        "Superkingdom",
+        "Kingdom",
+        "Phylum",
+        "Class",
+        "Order",
+        "Family",
+        "Genus",
+        "Species",
     ]
     if taxfile is not None:
         ncbi = NCBITaxa(taxfile)
@@ -34,63 +34,16 @@ def lineage_extractor(query_taxid, TaxInfo_object, taxfile=None):
     names = ncbi.get_taxid_translator(lineage)
 
     # get known data
-
     for key, val in ranks.items():
-        if val == list_of_taxa_ranks[0]:
-            TaxInfo_object.Superkingdom = names[key]
-            TaxInfo_object.Superkingdom_TaxId = key
-
-        elif val == list_of_taxa_ranks[1]:
-            TaxInfo_object.Kingdom = names[key]
-            TaxInfo_object.Kingdom_TaxId = key
-
-        elif val == list_of_taxa_ranks[2]:
-            TaxInfo_object.Phylum = names[key]
-            TaxInfo_object.Phylum_TaxId = key
-
-        elif val == list_of_taxa_ranks[3]:
-            TaxInfo_object.Class = names[key]
-            TaxInfo_object.Class_TaxId = key
-
-        elif val == list_of_taxa_ranks[4]:
-            TaxInfo_object.Order = names[key]
-            TaxInfo_object.Order_TaxId = key
-
-        elif val == list_of_taxa_ranks[5]:
-            TaxInfo_object.Family = names[key]
-            TaxInfo_object.Family_TaxId = key
-
-        elif val == list_of_taxa_ranks[6]:
-            TaxInfo_object.Genus = names[key]
-            TaxInfo_object.Genus_TaxId = key
-
-        elif val == list_of_taxa_ranks[7]:
-            TaxInfo_object.Species = names[key]
-            TaxInfo_object.Species_TaxId = key
+        for rank in list_of_taxa_ranks:
+            if val.lower() == rank.lower():
+                setattr(TaxInfo_object, rank, names[key])
+                setattr(TaxInfo_object, rank + "_TaxId", key)
 
     # fill in the blanks
-    if TaxInfo_object.Superkingdom is None:
-        TaxInfo_object.Superkingdom = "unk_sk"
-
-    if TaxInfo_object.Kingdom is None:
-        TaxInfo_object.Kingdom = "unk_k"
-
-    if TaxInfo_object.Phylum is None:
-        TaxInfo_object.Phylum = "unk_p"
-
-    if TaxInfo_object.Class is None:
-        TaxInfo_object.Class = "unk_c"
-
-    if TaxInfo_object.Order is None:
-        TaxInfo_object.Order = "unk_o"
-
-    if TaxInfo_object.Family is None:
-        TaxInfo_object.Family = "unk_f"
-
-    if TaxInfo_object.Genus is None:
-        TaxInfo_object.Genus = "unk_g"
-
-    if TaxInfo_object.Species is None:
-        TaxInfo_object.Species = "unk_s"
+    for attr in list_of_taxa_ranks:
+        if getattr(TaxInfo_object, attr) is None:
+            initial = attr.lower()[0] if attr != "Superkingdom" else "sk"
+            setattr(TaxInfo_object, attr, "unk_" + initial)
 
     return TaxInfo_object