bibliography.bib


@article{GregorPhylopythias2016,
  title = {{{PhyloPythiaS}}+: A Self-Training Method for the Rapid Reconstruction of Low-Ranking Taxonomic Bins from Metagenomes},
  volume = {4},
  issn = {2167-8359},
  doi = {10.7717/peerj.1603},
  abstract = {\textbf{Background.} Metagenomics is an approach for characterizing environmental microbial communities \emph{in situ}, it allows their functional and taxonomic characterization and to recover sequences from uncultured taxa. This is often achieved by a combination of sequence assembly and binning, where sequences are grouped into ‘bins’ representing taxa of the underlying microbial community. Assignment to low-ranking taxonomic bins is an important challenge for binning methods as is scalability to Gb-sized datasets generated with deep sequencing techniques. One of the best available methods for species bins recovery from deep-branching phyla is the expert-trained \emph{PhyloPythiaS} package, where a human expert decides on the taxa to incorporate in the model and identifies ‘training’ sequences based on marker genes directly from the sample. Due to the manual effort involved, this approach does not scale to multiple metagenome samples and requires substantial expertise, which researchers who are new to the area do not have. \textbf{Results.} We have developed \emph{PhyloPythiaS+}, a successor to our \emph{PhyloPythia(S)} software. The new (+) component performs the work previously done by the human expert. \emph{PhyloPythiaS+} also includes a new \emph{k}-mer counting algorithm, which accelerated the simultaneous counting of 4–6-mers used for taxonomic binning 100-fold and reduced the overall execution time of the software by a factor of three. Our software allows to analyze Gb-sized metagenomes with inexpensive hardware, and to recover species or genera-level bins with low error rates in a fully automated fashion. \emph{PhyloPythiaS+} was compared to \emph{MEGAN}, \emph{taxator-tk}, \emph{Kraken} and the generic \emph{PhyloPythiaS} model. The results showed that \emph{PhyloPythiaS+} performs especially well for samples originating from novel environments in comparison to the other methods. 
\textbf{Availability.} \emph{PhyloPythiaS+} in a virtual machine is available for installation under Windows, Unix systems or OS X on: https://github.com/algbioi/ppsp/wiki.},
  timestamp = {2016-06-17T09:22:44Z},
  journaltitle = {PeerJ},
  author = {Gregor, Ivan and Dröge, Johannes and Schirmer, Melanie and Quince, Christopher and McHardy, Alice C.},
  date = {2016-02},
  pages = {e1603},
  keywords = {bioinformatics,machine learning,metagenomics,Taxonomic classification}
}

@article{LuCocacola2016,
  title = {{{COCACOLA}}: Binning Metagenomic Contigs Using Sequence {{COmposition}}, Read {{CoverAge}}, {{CO}}-Alignment, and Paired-End Read {{LinkAge}}},
  shorttitle = {{{COCACOLA}}},
  doi = {10.1093/bioinformatics/btw290},
  timestamp = {2016-06-17T10:00:21Z},
  journaltitle = {Bioinformatics},
  author = {Lu, Yang Young and Chen, Ting and Fuhrman, Jed A. and Sun, Fengzhu},
  urldate = {2016-06-17},
  date = {2016},
  pages = {btw290}
}

@article{AlbertsenGenome2013,
  title = {Genome Sequences of Rare, Uncultured Bacteria Obtained by Differential Coverage Binning of Multiple Metagenomes},
  volume = {31},
  issn = {1546-1696},
  doi = {10.1038/nbt.2579},
  abstract = {Reference genomes are required to understand the diverse roles of microorganisms in ecology, evolution, human and animal health, but most species remain uncultured. Here we present a sequence composition-independent approach to recover high-quality microbial genomes from deeply sequenced metagenomes. Multiple metagenomes of the same community, which differ in relative population abundances, were used to assemble 31 bacterial genomes, including rare (\textless{}1\% relative abundance) species, from an activated sludge bioreactor. Twelve genomes were assembled into complete or near-complete chromosomes. Four belong to the candidate bacterial phylum TM7 and represent the most complete genomes for this phylum to date (relative abundances, 0.06-1.58\%). Reanalysis of published metagenomes reveals that differential coverage binning facilitates recovery of more complete and higher fidelity genome bins than other currently used methods, which are primarily based on sequence composition. This approach will be an important addition to the standard metagenome toolbox and greatly improve access to genomes of uncultured microorganisms.},
  timestamp = {2017-02-17T20:48:16Z},
  number = {6},
  journaltitle = {Nature Biotechnology},
  author = {Albertsen, Mads and Hugenholtz, Philip and Skarshewski, Adam and Nielsen, Kåre L and Tyson, Gene W and Nielsen, Per H},
  date = {2013-06},
  pages = {533--538},
  keywords = {Animals,Bacteria,Bacteria: classification,Bacteria: genetics,Bacterial,Base Sequence,DNA,Genome,Humans,Metagenome,Metagenomics,Molecular Sequence Data,Sequence Analysis},
  eprinttype = {pmid},
  eprint = {23707974}
}

@article{BuchfinkFast2014,
  author       = {Buchfink, Benjamin and Xie, Chao and Huson, Daniel H},
  title        = {Fast and Sensitive Protein Alignment Using {{DIAMOND}}},
  journaltitle = {Nature Methods},
  date         = {2014-11-17},
  volume       = {12},
  number       = {1},
  pages        = {59--60},
  issn         = {1548-7091, 1548-7105},
  doi          = {10.1038/nmeth.3176},
  urldate      = {2016-06-17},
  timestamp    = {2016-06-17T10:54:44Z}
}

@article{AmannPhylogenetic1995,
  title = {Phylogenetic Identification and in Situ Detection of Individual Microbial Cells without Cultivation},
  volume = {59},
  issn = {0146-0749},
  abstract = {The frequent discrepancy between direct microscopic counts and numbers of culturable bacteria from environmental samples is just one of several indications that we currently know only a minor part of the diversity of microorganisms in nature. A combination of direct retrieval of rRNA sequences and whole-cell oligonucleotide probing can be used to detect specific rRNA sequences of uncultured bacteria in natural samples and to microscopically identify individual cells. Studies have been performed with microbial assemblages of various complexities ranging from simple two-component bacterial endosymbiotic associations to multispecies enrichments containing magnetotactic bacteria to highly complex marine and soil communities. Phylogenetic analysis of the retrieved rRNA sequence of an uncultured microorganism reveals its closest culturable relatives and may, together with information on the physicochemical conditions of its natural habitat, facilitate more directed cultivation attempts. For the analysis of complex communities such as multispecies biofilms and activated-sludge flocs, a different approach has proven advantageous. Sets of probes specific to different taxonomic levels are applied consecutively beginning with the more general and ending with the more specific (a hierarchical top-to-bottom approach), thereby generating increasingly precise information on the structure of the community. Not only do rRNA-targeted whole-cell hybridizations yield data on cell morphology, specific cell counts, and in situ distributions of defined phylogenetic groups, but also the strength of the hybridization signal reflects the cellular rRNA content of individual cells. From the signal strength conferred by a specific probe, in situ growth rates and activities of individual cells might be estimated for known species. 
In many ecosystems, low cellular rRNA content and/or limited cell permeability, combined with background fluorescence, hinders in situ identification of autochthonous populations. Approaches to circumvent these problems are discussed in detail.},
  timestamp = {2016-06-16T16:07:15Z},
  number = {1},
  journaltitle = {Microbiological Reviews},
  author = {Amann, R I and Ludwig, W and Schleifer, K H},
  date = {1995-03},
  pages = {143--169},
  keywords = {16S,16S: analysis,16S: genetics,23S,23S: analysis,23S: genetics,Bacteria,Bacteria: genetics,Bacteria: isolation & purification,Bacterial,Bacterial: genetics,Bacterial: isolation & purification,Base Sequence,Genetic Variation,In Situ Hybridization,In Situ Hybridization: methods,Molecular Sequence Data,Ribosomal,RNA},
  eprinttype = {pmid},
  eprint = {7535888}
}

@article{BaranJoint2012,
  title = {Joint Analysis of Multiple Metagenomic Samples},
  volume = {8},
  issn = {1553-7358},
  doi = {10.1371/journal.pcbi.1002373},
  abstract = {The availability of metagenomic sequencing data, generated by sequencing DNA pooled from multiple microbes living jointly, has increased sharply in the last few years with developments in sequencing technology. Characterizing the contents of metagenomic samples is a challenging task, which has been extensively attempted by both supervised and unsupervised techniques, each with its own limitations. Common to practically all the methods is the processing of single samples only; when multiple samples are sequenced, each is analyzed separately and the results are combined. In this paper we propose to perform a combined analysis of a set of samples in order to obtain a better characterization of each of the samples, and provide two applications of this principle. First, we use an unsupervised probabilistic mixture model to infer hidden components shared across metagenomic samples. We incorporate the model in a novel framework for studying association of microbial sequence elements with phenotypes, analogous to the genome-wide association studies performed on human genomes: We demonstrate that stratification may result in false discoveries of such associations, and that the components inferred by the model can be used to correct for this stratification. Second, we propose a novel read clustering (also termed "binning") algorithm which operates on multiple samples simultaneously, leveraging on the assumption that the different samples contain the same microbial species, possibly in different proportions. We show that integrating information across multiple samples yields more precise binning on each of the samples. Moreover, for both applications we demonstrate that given a fixed depth of coverage, the average per-sample performance generally increases with the number of sequenced samples as long as the per-sample coverage is high enough.},
  timestamp = {2017-02-17T20:48:00Z},
  number = {2},
  journaltitle = {PLoS Computational Biology},
  author = {Baran, Yael and Halperin, Eran},
  date = {2012-02},
  pages = {e1002373},
  keywords = {Crohn's disease,DNA sequencing,Genome-wide association studies,Genomic databases,Genomic medicine,Metagenomics,Principal component analysis,Sequence alignment},
  eprinttype = {pmid},
  eprint = {22359490}
}

@article{BergerPerformance2011,
  title = {Performance, Accuracy, and Web Server for Evolutionary Placement of Short Sequence Reads under Maximum Likelihood},
  volume = {60},
  issn = {1076-836X},
  doi = {10.1093/sysbio/syr010},
  abstract = {We present an evolutionary placement algorithm (EPA) and a Web server for the rapid assignment of sequence fragments (short reads) to edges of a given phylogenetic tree under the maximum-likelihood model. The accuracy of the algorithm is evaluated on several real-world data sets and compared with placement by pair-wise sequence comparison, using edit distances and BLAST. We introduce a slow and accurate as well as a fast and less accurate placement algorithm. For the slow algorithm, we develop additional heuristic techniques that yield almost the same run times as the fast version with only a small loss of accuracy. When those additional heuristics are employed, the run time of the more accurate algorithm is comparable with that of a simple BLAST search for data sets with a high number of short query sequences. Moreover, the accuracy of the EPA is significantly higher, in particular when the sample of taxa in the reference topology is sparse or inadequate. Our algorithm, which has been integrated into RAxML, therefore provides an equally fast but more accurate alternative to BLAST for tree-based inference of the evolutionary origin and composition of short sequence reads. We are also actively developing a Web server that offers a freely available service for computing read placements on trees using the EPA.},
  timestamp = {2016-06-16T16:07:17Z},
  number = {3},
  journaltitle = {Systematic Biology},
  author = {Berger, Simon A and Krompass, Denis and Stamatakis, Alexandros},
  date = {2011-05},
  pages = {291--302},
  keywords = {Algorithms,Amino Acid Sequence,Base Sequence,Computer Simulation,DNA,DNA: methods,Evolution,Internet,Likelihood Functions,Molecular,phylogenetic placement algorithm,Phylogeny,Protein,Protein: methods,RNA,RNA: methods,Sequence Alignment,Sequence Alignment: methods,Sequence Analysis,Software,taxonomic binning},
  eprinttype = {pmid},
  eprint = {21436105}
}

@article{BradyPhymmbl2011,
  title = {{{PhymmBL}} Expanded: Confidence Scores, Custom Databases, Parallelization and More},
  volume = {8},
  issn = {1548-7105},
  doi = {10.1038/nmeth0511-367},
  timestamp = {2016-06-16T16:07:18Z},
  number = {5},
  journaltitle = {Nature Methods},
  author = {Brady, Arthur and Salzberg, Steven L},
  date = {2011-05},
  pages = {367},
  keywords = {Animals,Classification,Classification: methods,Databases,Elephants,Elephants: classification,Elephants: genetics,Mammoths,Mammoths: classification,Mammoths: genetics,metagenomics,Metagenomics: statistics & numerical data,Nucleic Acid,Phylogeny,Sequence Alignment,Sequence Alignment: statistics & numerical data,Software},
  eprinttype = {pmid},
  eprint = {21527926}
}

@article{BradyPhymm2009,
  title = {Phymm and {{PhymmBL}}: Metagenomic Phylogenetic Classification with Interpolated {{Markov}} Models},
  volume = {6},
  issn = {1548-7105},
  doi = {10.1038/nmeth.1358},
  abstract = {Metagenomics projects collect DNA from uncharacterized environments that may contain thousands of species per sample. One main challenge facing metagenomic analysis is phylogenetic classification of raw sequence reads into groups representing the same or similar taxa, a prerequisite for genome assembly and for analyzing the biological diversity of a sample. New sequencing technologies have made metagenomics easier, by making sequencing faster, and more difficult, by producing shorter reads than previous technologies. Classifying sequences from reads as short as 100 base pairs has until now been relatively inaccurate, requiring researchers to use older, long-read technologies. We present Phymm, a classifier for metagenomic data, that has been trained on 539 complete, curated genomes and can accurately classify reads as short as 100 base pairs, a substantial improvement over previous composition-based classification methods. We also describe how combining Phymm with sequence alignment algorithms improves accuracy.},
  timestamp = {2016-06-16T16:07:18Z},
  number = {9},
  journaltitle = {Nature Methods},
  author = {Brady, Arthur and Salzberg, Steven L},
  date = {2009-09},
  pages = {673--676},
  keywords = {Artificial Intelligence,Bacteria,Bacteria: classification,Bacteria: genetics,Base Sequence,DNA,DNA: classification,DNA: genetics,Genetic,Genomics,Genomics: methods,Hydrogen-Ion Concentration,Markov Chains,Mining,Models,Phylogeny,Sequence Alignment,Soil Microbiology},
  eprinttype = {pmid},
  eprint = {19648916}
}

@article{CamachoBlast2009,
  title = {{{BLAST}}+: Architecture and Applications},
  volume = {10},
  issn = {1471-2105},
  doi = {10.1186/1471-2105-10-421},
  abstract = {BACKGROUND: Sequence similarity searching is a very important bioinformatics task. While Basic Local Alignment Search Tool (BLAST) outperforms exact methods through its use of heuristics, the speed of the current BLAST software is suboptimal for very long queries or database sequences. There are also some shortcomings in the user-interface of the current command-line applications. RESULTS: We describe features and improvements of rewritten BLAST software and introduce new command-line applications. Long query sequences are broken into chunks for processing, in some cases leading to dramatically shorter run times. For long database sequences, it is possible to retrieve only the relevant parts of the sequence, reducing CPU time and memory usage for searches of short queries against databases of contigs or chromosomes. The program can now retrieve masking information for database sequences from the BLAST databases. A new modular software library can now access subject sequence data from arbitrary data sources. We introduce several new features, including strategy files that allow a user to save and reuse their favorite set of options. The strategy files can be uploaded to and downloaded from the NCBI BLAST web site. CONCLUSION: The new BLAST command-line applications, compared to the current BLAST tools, demonstrate substantial speed improvements for long queries as well as chromosome length database sequences. We have also improved the user interface of the command-line applications.},
  timestamp = {2016-06-16T16:07:19Z},
  journaltitle = {BMC Bioinformatics},
  author = {Camacho, Christiam and Coulouris, George and Avagyan, Vahram and Ma, Ning and Papadopoulos, Jason and Bealer, Kevin and Madden, Thomas L},
  date = {2009-12},
  pages = {421},
  keywords = {Computational Biology,Computational Biology: methods,Databases,Genetic,Sequence Alignment,Software},
  eprinttype = {pmid},
  eprint = {20003500}
}

@article{CarrReconstructing2013,
  title = {Reconstructing the Genomic Content of Microbiome Taxa through Shotgun Metagenomic Deconvolution},
  volume = {9},
  issn = {1553-7358},
  doi = {10.1371/journal.pcbi.1003292},
  abstract = {Metagenomics has transformed our understanding of the microbial world, allowing researchers to bypass the need to isolate and culture individual taxa and to directly characterize both the taxonomic and gene compositions of environmental samples. However, associating the genes found in a metagenomic sample with the specific taxa of origin remains a critical challenge. Existing binning methods, based on nucleotide composition or alignment to reference genomes allow only a coarse-grained classification and rely heavily on the availability of sequenced genomes from closely related taxa. Here, we introduce a novel computational framework, integrating variation in gene abundances across multiple samples with taxonomic abundance data to deconvolve metagenomic samples into taxa-specific gene profiles and to reconstruct the genomic content of community members. This assembly-free method is not bounded by various factors limiting previously described methods of metagenomic binning or metagenomic assembly and represents a fundamentally different approach to metagenomic-based genome reconstruction. An implementation of this framework is available at http://elbo.gs.washington.edu/software.html. We first describe the mathematical foundations of our framework and discuss considerations for implementing its various components. We demonstrate the ability of this framework to accurately deconvolve a set of metagenomic samples and to recover the gene content of individual taxa using synthetic metagenomic samples. We specifically characterize determinants of prediction accuracy and examine the impact of annotation errors on the reconstructed genomes. We finally apply metagenomic deconvolution to samples from the Human Microbiome Project, successfully reconstructing genus-level genomic content of various microbial genera, based solely on variation in gene count. These reconstructed genera are shown to correctly capture genus-specific properties. 
With the accumulation of metagenomic data, this deconvolution framework provides an essential tool for characterizing microbial taxa never before seen, laying the foundation for addressing fundamental questions concerning the taxa comprising diverse microbial communities.},
  timestamp = {2016-06-16T16:07:19Z},
  number = {10},
  journaltitle = {PLoS Computational Biology},
  author = {Carr, Rogan and Shen-Orr, Shai S and Borenstein, Elhanan},
  date = {2013-10},
  pages = {e1003292},
  eprinttype = {pmid},
  eprint = {24146609}
}

@article{DarlingPhylosift2014,
  title = {{{PhyloSift}}: Phylogenetic Analysis of Genomes and Metagenomes},
  volume = {2},
  issn = {2167-8359},
  doi = {10.7717/peerj.243},
  abstract = {Like all organisms on the planet, environmental microbes are subject to the forces of molecular evolution. Metagenomic sequencing provides a means to access the DNA sequence of uncultured microbes. By combining DNA sequencing of microbial communities with evolutionary modeling and phylogenetic analysis we might obtain new insights into microbiology and also provide a basis for practical tools such as forensic pathogen detection. In this work we present an approach to leverage phylogenetic analysis of metagenomic sequence data to conduct several types of analysis. First, we present a method to conduct phylogeny-driven Bayesian hypothesis tests for the presence of an organism in a sample. Second, we present a means to compare community structure across a collection of many samples and develop direct associations between the abundance of certain organisms and sample metadata. Third, we apply new tools to analyze the phylogenetic diversity of microbial communities and again demonstrate how this can be associated to sample metadata. These analyses are implemented in an open source software pipeline called PhyloSift. As a pipeline, PhyloSift incorporates several other programs including LAST, HMMER, and pplacer to automate phylogenetic analysis of protein coding and RNA sequences in metagenomic datasets generated by modern sequencing platforms (e.g., Illumina, 454).},
  timestamp = {2016-06-16T16:07:22Z},
  journaltitle = {PeerJ},
  author = {Darling, Aaron E. and Jospin, Guillaume and Lowe, Eric and Matsen, Frederick A. and Bik, Holly M. and Eisen, Jonathan A.},
  date = {2014-01-09},
  pages = {e243},
  keywords = {bayes factor,community,forensics,metagenomics,microbial diversity,phylogenetics},
  eprinttype = {pmid},
  eprint = {24482762}
}

@article{DesantisGreengenes2006,
  title = {Greengenes, a Chimera-Checked {{16S rRNA}} Gene Database and Workbench Compatible with {{ARB}}},
  volume = {72},
  issn = {0099-2240},
  doi = {10.1128/AEM.03006-05},
  abstract = {A 16S rRNA gene database (http://greengenes.lbl.gov) addresses limitations of public repositories by providing chimera screening, standard alignment, and taxonomic classification using multiple published taxonomies. It was found that there is incongruent taxonomic nomenclature among curators even at the phylum level. Putative chimeras were identified in 3\% of environmental sequences and in 0.2\% of records derived from isolates. Environmental sequences were classified into 100 phylum-level lineages in the Archaea and Bacteria.},
  timestamp = {2016-06-16T16:07:23Z},
  number = {7},
  journaltitle = {Applied and Environmental Microbiology},
  author = {DeSantis, T Z and Hugenholtz, P and Larsen, N and Rojas, M and Brodie, E L and Keller, K and Huber, T and Dalevi, D and Hu, P and Andersen, G L},
  date = {2006-07},
  pages = {5069--5072},
  keywords = {16S,16S: genetics,Archaea,Archaea: classification,Archaea: genetics,Bacteria,Bacteria: classification,Bacteria: genetics,Databases,Genes,Genetic,Nucleic Acid,Nucleic Acid: standards,Polymerase Chain Reaction,Recombination,Reproducibility of Results,Ribosomal,RNA,rRNA,Sequence Alignment,Software},
  eprinttype = {pmid},
  eprint = {16820507}
}

@article{DeschavanneGenomic1999,
  title = {Genomic Signature: Characterization and Classification of Species Assessed by Chaos Game Representation of Sequences},
  volume = {16},
  issn = {0737-4038},
  abstract = {We explored DNA structures of genomes by means of a new tool derived from the "chaotic dynamical systems" theory (the so-called chaos game representation [CGR]), which allows the depiction of frequencies of oligonucleotides in the form of images. Using CGR, we observe that subsequences of a genome exhibit the main characteristics of the whole genome, attesting to the validity of the genomic signature concept. Base concentrations, stretches (runs of complementary bases or purines/pyrimidines), and patches (over- or underexpressed words of various lengths) are the main factors explaining the variability observed among sequences. The distance between images may be considered a measure of phylogenetic proximity. Eukaryotes and prokaryotes can be identified merely on the basis of their DNA structures.},
  timestamp = {2016-06-16T16:07:23Z},
  number = {10},
  journaltitle = {Molecular Biology and Evolution},
  author = {Deschavanne, P J and Giron, A and Vilain, J and Fagot, G and Fertil, B},
  date = {1999-10},
  pages = {1391--1399},
  keywords = {Algorithms,Animals,Classification,Computer-Assisted,Computer Simulation,DNA,DNA: analysis,DNA: genetics,Evolution,Genome,Humans,Image Processing,Molecular,Phylogeny,Species Specificity},
  eprinttype = {pmid},
  eprint = {10563018}
}

@article{DiazTacoa2009,
  title = {{{TACOA}}: Taxonomic Classification of Environmental Genomic Fragments Using a Kernelized Nearest Neighbor Approach},
  volume = {10},
  issn = {1471-2105},
  doi = {10.1186/1471-2105-10-56},
  abstract = {BACKGROUND: Metagenomics, or the sequencing and analysis of collective genomes (metagenomes) of microorganisms isolated from an environment, promises direct access to the "unculturable majority". This emerging field offers the potential to lay solid basis on our understanding of the entire living world. However, the taxonomic classification is an essential task in the analysis of metagenomics data sets that it is still far from being solved. We present a novel strategy to predict the taxonomic origin of environmental genomic fragments. The proposed classifier combines the idea of the k-nearest neighbor with strategies from kernel-based learning. RESULTS: Our novel strategy was extensively evaluated using the leave-one-out cross validation strategy on fragments of variable length (800 bp - 50 Kbp) from 373 completely sequenced genomes. TACOA is able to classify genomic fragments of length 800 bp and 1 Kbp with high accuracy until rank class. For longer fragments \textgreater{} or = 3 Kbp accurate predictions are made at even deeper taxonomic ranks (order and genus). Remarkably, TACOA also produces reliable results when the taxonomic origin of a fragment is not represented in the reference set, thus classifying such fragments to its known broader taxonomic class or simply as "unknown". We compared the classification accuracy of TACOA with the latest intrinsic classifier PhyloPythia using 63 recently published complete genomes. For fragments of length 800 bp and 1 Kbp the overall accuracy of TACOA is higher than that obtained by PhyloPythia at all taxonomic ranks. For all fragment lengths, both methods achieved comparable high specificity results up to rank class and low false negative rates are also obtained. CONCLUSION: An accurate multi-class taxonomic classifier was developed for environmental genomic fragments. TACOA can predict with high reliability the taxonomic origin of genomic fragments as short as 800 bp. 
The proposed method is transparent, fast, accurate and the reference set can be easily updated as newly sequenced genomes become available. Moreover, the method demonstrated to be competitive when compared to the most current classifier PhyloPythia and has the advantage that it can be locally installed and the reference set can be kept up-to-date.},
  timestamp = {2016-06-16T16:07:24Z},
  journaltitle = {BMC Bioinformatics},
  author = {Diaz, Naryttza N and Krause, Lutz and Goesmann, Alexander and Niehaus, Karsten and Nattkemper, Tim W},
  date = {2009-02},
  pages = {56},
  keywords = {Algorithms,Archaea,Archaea: classification,Archaea: genetics,Bacteria,Bacteria: classification,Bacteria: genetics,Classification,Classification: methods,Cluster Analysis,Environmental Microbiology,Genome,Genomics,Genomics: methods,metagenomics,Software,Software Validation,taxonomic binning},
  eprinttype = {pmid},
  eprint = {19210774}
}

@article{DrogeTaxatortk2014,
  title = {{{Taxator-tk}}: Precise Taxonomic Assignment of Metagenomes by Fast Approximation of Evolutionary Neighborhoods},
  issn = {1367-4811},
  doi = {10.1093/bioinformatics/btu745},
  abstract = {MOTIVATION: Metagenomics characterizes microbial communities by random shotgun sequencing of DNA isolated directly from an environment of interest. An essential step in computational metagenome analysis is taxonomic sequence assignment, which allows identifying the sequenced community members and reconstructing taxonomic bins with sequence data for the individual taxa. For the massive datasets generated by next-generation sequencing technologies, this cannot be performed with de-novo phylogenetic inference methods. We describe an algorithm and the accompanying software, taxator-tk, which performs taxonomic sequence assignment by fast approximate determination of evolutionary neighbors from sequence similarities. RESULTS: Taxator-tk was precise in its taxonomic assignment across all ranks and taxa for a range of evolutionary distances and for short as well as for long sequences. In addition to the taxonomic binning of metagenomes, it is well suited for profiling microbial communities from metagenome samples because it identifies bacterial, archaeal and eukaryotic community members without being affected by varying primer binding strengths, as in marker gene amplification, or copy number variations of marker genes across different taxa. Taxator-tk has an efficient, parallelized implementation that allows the assignment of 6 Gb of sequence data per day on a standard multiprocessor system with 10 CPU cores and microbial RefSeq as the genomic reference data. Availability and implementation: Taxator-tk source and binary program files are publicly available at http://algbio.cs.uni-duesseldorf.de/software/. CONTACT: Alice.McHardy@uni-duesseldorf.de Supplementary information: Supplementary data are available at Bioinformatics online.},
  timestamp = {2016-07-06T12:35:17Z},
  eprinttype = {pmid},
  eprint = {25388150},
  journaltitle = {Bioinformatics},
  author = {Dröge, Johannes and Gregor, Ivan and McHardy, Alice C.},
  date = {2014-11},
  pages = {1--8}
}

@article{FinnHmmer2011,
  title = {{{HMMER}} Web Server: Interactive Sequence Similarity Searching},
  volume = {39},
  issn = {1362-4962},
  doi = {10.1093/nar/gkr367},
  abstract = {HMMER is a software suite for protein sequence similarity searches using probabilistic methods. Previously, HMMER has mainly been available only as a computationally intensive UNIX command-line tool, restricting its use. Recent advances in the software, HMMER3, have resulted in a 100-fold speed gain relative to previous versions. It is now feasible to make efficient profile hidden Markov model (profile HMM) searches via the web. A HMMER web server (http://hmmer.janelia.org) has been designed and implemented such that most protein database searches return within a few seconds. Methods are available for searching either a single protein sequence, multiple protein sequence alignment or profile HMM against a target sequence database, and for searching a protein sequence against Pfam. The web server is designed to cater to a range of different user expertise and accepts batch uploading of multiple queries at once. All search methods are also available as RESTful web services, thereby allowing them to be readily integrated as remotely executed tasks in locally scripted workflows. We have focused on minimizing search times and the ability to rapidly display tabular results, regardless of the number of matches found, developing graphical summaries of the search results to provide quick, intuitive appraisement of them.},
  timestamp = {2016-06-16T16:07:29Z},
  number = {Suppl 2},
  journaltitle = {Nucleic Acids Research},
  author = {Finn, Robert D. and Clements, Jody and Eddy, Sean R.},
  date = {2011-07},
  pages = {W29--W37},
  eprinttype = {pubmed},
  eprint = {21593126}
}

@article{FrithParameters2010,
  title = {Parameters for Accurate Genome Alignment},
  volume = {11},
  issn = {1471-2105},
  doi = {10.1186/1471-2105-11-80},
  abstract = {BACKGROUND: Genome sequence alignments form the basis of much research. Genome alignment depends on various mundane but critical choices, such as how to mask repeats and which score parameters to use. Surprisingly, there has been no large-scale assessment of these choices using real genomic data. Moreover, rigorous procedures to control the rate of spurious alignment have not been employed. RESULTS: We have assessed 495 combinations of score parameters for alignment of animal, plant, and fungal genomes. As our gold-standard of accuracy, we used genome alignments implied by multiple alignments of proteins and of structural RNAs. We found the HOXD scoring schemes underlying alignments in the UCSC genome database to be far from optimal, and suggest better parameters. Higher values of the X-drop parameter are not always better. E-values accurately indicate the rate of spurious alignment, but only if tandem repeats are masked in a non-standard way. Finally, we show that gamma-centroid (probabilistic) alignment can find highly reliable subsets of aligned bases. CONCLUSIONS: These results enable more accurate genome alignment, with reliability measures for local alignments and for individual aligned bases. This study was made possible by our new software, LAST, which can align vertebrate genomes in a few hours http://last.cbrc.jp/.},
  timestamp = {2016-06-16T16:07:30Z},
  journaltitle = {BMC Bioinformatics},
  author = {Frith, Martin C. and Hamada, Michiaki and Horton, Paul},
  date = {2010-01},
  pages = {80},
  keywords = {Base Sequence,Computational Biology,Computational Biology: methods,Genome,Molecular Sequence Data,Proteins,Proteins: chemistry,RNA,RNA: chemistry,Sequence Alignment,Sequence Alignment: methods,Sequence Analysis},
  eprinttype = {pubmed},
  eprint = {20144198}
}

@article{FuhrmanMicrobial2009,
  title = {Microbial Community Structure and Its Functional Implications},
  volume = {459},
  issn = {1476-4687},
  doi = {10.1038/nature08058},
  abstract = {Marine microbial communities are engines of globally important processes, such as the marine carbon, nitrogen and sulphur cycles. Recent data on the structures of these communities show that they adhere to universal biological rules. Co-occurrence patterns can help define species identities, and systems-biology tools are revealing networks of interacting microorganisms. Some microbial systems are found to change predictably, helping us to anticipate how microbial communities and their activities will shift in a changing world.},
  timestamp = {2016-06-16T16:07:31Z},
  number = {7244},
  journaltitle = {Nature},
  author = {Fuhrman, Jed A.},
  date = {2009-05},
  pages = {193--199},
  keywords = {Animals,Ecosystem,Genomics,Genomics: methods,Genomics: trends,Greenhouse Effect,Marine Biology,Water Microbiology},
  eprinttype = {pubmed},
  eprint = {19444205}
}

@article{GerlachTaxonomic2011,
  title = {Taxonomic Classification of Metagenomic Shotgun Sequences with {{CARMA3}}},
  volume = {39},
  issn = {1362-4962},
  doi = {10.1093/nar/gkr225},
  abstract = {The vast majority of microbes are unculturable and thus cannot be sequenced by means of traditional methods. High-throughput sequencing techniques like 454 or Solexa-Illumina make it possible to explore those microbes by studying whole natural microbial communities and analysing their biological diversity as well as the underlying metabolic pathways. Over the past few years, different methods have been developed for the taxonomic and functional characterization of metagenomic shotgun sequences. However, the taxonomic classification of metagenomic sequences from novel species without close homologue in the biological sequence databases poses a challenge due to the high number of wrong taxonomic predictions on lower taxonomic ranks. Here we present CARMA3, a new method for the taxonomic classification of assembled and unassembled metagenomic sequences that has been adapted to work with both BLAST and HMMER3 homology searches. We show that our method makes fewer wrong taxonomic predictions (at the same sensitivity) than other BLAST-based methods. CARMA3 is freely accessible via the web application WebCARMA from http://webcarma.cebitec.uni-bielefeld.de.},
  timestamp = {2016-06-16T16:07:33Z},
  number = {14},
  journaltitle = {Nucleic Acids Research},
  author = {Gerlach, Wolfgang and Stoye, Jens},
  date = {2011-05},
  pages = {e91},
  eprinttype = {pubmed},
  eprint = {21586583}
}

@online{GregorPhylopythias2014,
  title = {{{PhyloPythiaS}}+: {{A}} Self-Training Method for the Rapid Reconstruction of Low-Ranking Taxonomic Bins from Metagenomes},
  abstract = {Metagenomics is an approach for characterizing environmental microbial communities in situ, it allows their functional and taxonomic characterization and to recover sequences from uncultured taxa. For communities of up to medium diversity, e.g. excluding environments such as soil, this is often achieved by a combination of sequence assembly and binning, where sequences are grouped into 'bins' representing taxa of the underlying microbial community from which they originate. Assignment to low-ranking taxonomic bins is an important challenge for binning methods as is scalability to Gb-sized datasets generated with deep sequencing techniques. One of the best available methods for the recovery of species bins from an individual metagenome sample is the expert-trained PhyloPythiaS package, where a human expert decides on the taxa to incorporate in a composition-based taxonomic metagenome classifier and identifies the 'training' sequences using marker genes directly from the sample. Due to the manual effort involved, this approach does not scale to multiple metagenome samples and requires substantial expertise, which researchers who are new to the area may not have. With these challenges in mind, we have developed PhyloPythiaS+, a successor to our previously described method PhyloPythia(S). The newly developed + component performs the work previously done by the human expert. PhyloPythiaS+ also includes a new k-mer counting algorithm, which accelerated k-mer counting 100-fold and reduced the overall execution time of the software by a factor of three. Our software allows to analyze Gb-sized metagenomes with inexpensive hardware, and to recover species or genera-level bins with low error rates in a fully automated fashion.},
  timestamp = {2016-06-16T16:07:38Z},
  eprinttype = {arxiv},
  eprint = {1406.7123},
  author = {Gregor, Ivan and Dröge, Johannes and Schirmer, Melanie and Quince, Christopher and McHardy, Alice C.},
  date = {2014-06},
  pagetotal = {67}
}

@article{HamadyMicrobial2009,
  title = {Microbial Community Profiling for Human Microbiome Projects: {{Tools}}, Techniques, and Challenges},
  volume = {19},
  issn = {1088-9051},
  doi = {10.1101/gr.085464.108},
  abstract = {High-throughput sequencing studies and new software tools are revolutionizing microbial community analyses, yet the variety of experimental and computational methods can be daunting. In this review, we discuss some of the different approaches to community profiling, highlighting strengths and weaknesses of various experimental approaches, sequencing methodologies, and analytical methods. We also address one key question emerging from various Human Microbiome Projects: Is there a substantial core of abundant organisms or lineages that we all share? It appears that in some human body habitats, such as the hand and the gut, the diversity among individuals is so great that we can rule out the possibility that any species is at high abundance in all individuals: It is possible that the focus should instead be on higher-level taxa or on functional genes instead.},
  timestamp = {2016-06-16T16:07:38Z},
  number = {7},
  journaltitle = {Genome Research},
  author = {Hamady, Micah and Knight, Rob},
  date = {2009-07},
  pages = {1141--1152},
  keywords = {16S,16S: classification,16S: genetics,Gene Expression Profiling,Humans,Metagenome,Ribosomal,RNA},
  eprinttype = {pubmed},
  eprint = {19383763}
}

@article{HandelsmanMetagenomics2004,
  title = {Metagenomics: Application of Genomics to Uncultured Microorganisms},
  volume = {68},
  issn = {1092-2172},
  doi = {10.1128/MMBR.68.4.669-685.2004},
  abstract = {Metagenomics (also referred to as environmental and community genomics) is the genomic analysis of microorganisms by direct extraction and cloning of DNA from an assemblage of microorganisms. The development of metagenomics stemmed from the ineluctable evidence that as-yet-uncultured microorganisms represent the vast majority of organisms in most environments on earth. This evidence was derived from analyses of 16S rRNA gene sequences amplified directly from the environment, an approach that avoided the bias imposed by culturing and led to the discovery of vast new lineages of microbial life. Although the portrait of the microbial world was revolutionized by analysis of 16S rRNA genes, such studies yielded only a phylogenetic description of community membership, providing little insight into the genetics, physiology, and biochemistry of the members. Metagenomics provides a second tier of technical innovation that facilitates study of the physiology and ecology of environmental microorganisms. Novel genes and gene products discovered through metagenomics include the first bacteriorhodopsin of bacterial origin; novel small molecules with antimicrobial activity; and new members of families of known proteins, such as an Na(+)(Li(+))/H(+) antiporter, RecA, DNA polymerase, and antibiotic resistance determinants. Reassembly of multiple genomes has provided insight into energy and nutrient cycling within the community, genome structure, gene function, population genetics and microheterogeneity, and lateral gene transfer among members of an uncultured community. The application of metagenomic sequence information will facilitate the design of better culturing strategies to link genomic analysis with pure culture studies.},
  timestamp = {2016-06-16T16:07:39Z},
  number = {4},
  journaltitle = {Microbiology and Molecular Biology Reviews},
  author = {Handelsman, Jo},
  date = {2004-12},
  pages = {669--685},
  keywords = {Bacterial,Biotechnology,Ecology,Environmental Microbiology,Genetics,Genome,Genomics,Genomics: methods,Microbial},
  eprinttype = {pubmed},
  eprint = {15590779}
}

@article{HauswedellLambda2014,
  author = {Hauswedell, H. and Singer, J. and Reinert, K.},
  title = {Lambda: The Local Aligner for Massive Biological Data},
  journaltitle = {Bioinformatics},
  date = {2014-08},
  volume = {30},
  number = {17},
  pages = {i349--i355},
  issn = {1367-4803},
  doi = {10.1093/bioinformatics/btu439},
  timestamp = {2016-06-16T16:07:39Z}
}

@article{HessMetagenomic2011,
  title = {Metagenomic Discovery of Biomass-Degrading Genes and Genomes from Cow Rumen},
  volume = {331},
  issn = {1095-9203},
  doi = {10.1126/science.1200387},
  abstract = {The paucity of enzymes that efficiently deconstruct plant polysaccharides represents a major bottleneck for industrial-scale conversion of cellulosic biomass into biofuels. Cow rumen microbes specialize in degradation of cellulosic plant material, but most members of this complex community resist cultivation. To characterize biomass-degrading genes and genomes, we sequenced and analyzed 268 gigabases of metagenomic DNA from microbes adherent to plant fiber incubated in cow rumen. From these data, we identified 27,755 putative carbohydrate-active genes and expressed 90 candidate proteins, of which 57\% were enzymatically active against cellulosic substrates. We also assembled 15 uncultured microbial genomes, which were validated by complementary methods including single-cell genome sequencing. These data sets provide a substantially expanded catalog of genes and genomes participating in the deconstruction of cellulosic biomass.},
  timestamp = {2016-06-16T16:07:39Z},
  number = {6016},
  journaltitle = {Science},
  author = {Hess, Matthias and Sczyrba, Alexander and Egan, Rob and Kim, Tae-Wan and Chokhawala, Harshal and Schroth, Gary and Luo, Shujun and Clark, Douglas S. and Chen, Feng and Zhang, Tao and Mackie, Roderick I. and Pennacchio, Len A. and Tringe, Susannah G. and Visel, Axel and Woyke, Tanja and Wang, Zhong and Rubin, Edward M.},
  date = {2011-01},
  pages = {463--467},
  keywords = {4-beta-Cellobiosidase,4-beta-Cellobiosidase: genetics,4-beta-Cellobiosidase: metabolism,Amino Acid Sequence,Animals,Bacteria,Bacteria: enzymology,Bacteria: genetics,Bacteria: isolation & purification,Bacterial,Bacterial Proteins,Bacterial Proteins: chemistry,Bacterial Proteins: genetics,Bacterial Proteins: metabolism,Bacteria: metabolism,Biomass,Carbohydrate Metabolism,Cattle,Cattle: microbiology,Cellulase,Cellulase: genetics,Cellulase: metabolism,Cellulases,Cellulases: chemistry,Cellulases: genetics,Cellulases: metabolism,Cellulose,Cellulose 1,Cellulose: metabolism,DNA,Genes,Genome,Metagenome,metagenomics,Metagenomics: methods,Molecular Sequence Annotation,Molecular Sequence Data,Poaceae,Poaceae: microbiology,Rumen,Rumen: metabolism,Rumen: microbiology,Sequence Analysis},
  eprinttype = {pubmed},
  eprint = {21273488}
}

@article{HuPirs2012,
  title = {{{pIRS}}: {{Profile}}-Based {{Illumina}} Pair-End Reads Simulator},
  volume = {28},
  issn = {1367-4811},
  doi = {10.1093/bioinformatics/bts187},
  abstract = {The next-generation high-throughput sequencing technologies, especially from Illumina, have been widely used in re-sequencing and de novo assembly studies. However, there is no existing software that can simulate Illumina reads with real error and quality distributions and coverage bias yet, which is very useful in relevant software development and study designing of sequencing projects.},
  timestamp = {2016-06-16T16:07:40Z},
  number = {11},
  journaltitle = {Bioinformatics},
  author = {Hu, Xuesong and Yuan, Jianying and Shi, Yujian and Lu, Jianliang and Liu, Binghang and Li, Zhenyu and Chen, Yanxiang and Mu, Desheng and Zhang, Hao and Li, Nan and Yue, Zhen and Bai, Fan and Li, Heng and Fan, Wei},
  date = {2012-06},
  pages = {1533--1535},
  eprinttype = {pubmed},
  eprint = {22508794}
}

@article{HuangArt2012,
  title = {{{ART}}: A Next-Generation Sequencing Read Simulator},
  volume = {28},
  issn = {1367-4811},
  doi = {10.1093/bioinformatics/btr708},
  abstract = {ART is a set of simulation tools that generate synthetic next-generation sequencing reads. This functionality is essential for testing and benchmarking tools for next-generation sequencing data analysis including read alignment, de novo assembly and genetic variation discovery. ART generates simulated sequencing reads by emulating the sequencing process with built-in, technology-specific read error models and base quality value profiles parameterized empirically in large sequencing datasets. We currently support all three major commercial next-generation sequencing platforms: Roche's 454, Illumina's Solexa and Applied Biosystems' SOLiD. ART also allows the flexibility to use customized read error model parameters and quality profiles.},
  timestamp = {2017-02-17T20:48:09Z},
  number = {4},
  journaltitle = {Bioinformatics},
  author = {Huang, Weichun and Li, Leping and Myers, Jason R. and Marth, Gabor T.},
  date = {2012-02},
  pages = {593--594},
  eprinttype = {pubmed},
  eprint = {22199392}
}

@article{HugenholtzExploring2002,
  title = {Exploring Prokaryotic Diversity in the Genomic Era},
  volume = {3},
  issn = {1465-6914},
  doi = {10.1186/gb-2002-3-2-reviews0003},
  abstract = {Our understanding of prokaryote biology from study of pure cultures and genome sequencing has been limited by a pronounced sampling bias towards four bacterial phyla - Proteobacteria, Firmicutes, Actinobacteria and Bacteroidetes - out of 35 bacterial and 18 archaeal phylum-level lineages. This bias is beginning to be rectified by the use of phylogenetically directed isolation strategies and by directly accessing microbial genomes from environmental samples.},
  timestamp = {2016-06-16T16:07:40Z},
  number = {2},
  journaltitle = {Genome Biology},
  author = {Hugenholtz, Philip},
  date = {2002-01},
  pages = {REVIEWS0003},
  keywords = {Actinobacteria,Actinobacteria: genetics,Actinobacteria: isolation & purification,Bacterial,Bacterial: genetics,Bacteroidaceae,Bacteroidaceae: genetics,Bacteroidaceae: isolation & purification,Genes,Genetic Variation,Genetic Variation: genetics,Genome,Gram-Positive Bacteria,Gram-Positive Bacteria: genetics,Gram-Positive Bacteria: isolation & purification,Phylogeny,Proteobacteria,Proteobacteria: genetics,Proteobacteria: isolation & purification},
  eprinttype = {pubmed},
  eprint = {11864374}
}

@article{HuseExploring2008,
  title = {Exploring Microbial Diversity and Taxonomy Using {{SSU rRNA}} Hypervariable Tag Sequencing},
  volume = {4},
  issn = {1553-7404},
  doi = {10.1371/journal.pgen.1000255},
  abstract = {Massively parallel pyrosequencing of hypervariable regions from small subunit ribosomal RNA (SSU rRNA) genes can sample a microbial community two or three orders of magnitude more deeply per dollar and per hour than capillary sequencing of full-length SSU rRNA. As with full-length rRNA surveys, each sequence read is a tag surrogate for a single microbe. However, rather than assigning taxonomy by creating gene trees de novo that include all experimental sequences and certain reference taxa, we compare the hypervariable region tags to an extensive database of rRNA sequences and assign taxonomy based on the best match in a Global Alignment for Sequence Taxonomy (GAST) process. The resulting taxonomic census provides information on both composition and diversity of the microbial community. To determine the effectiveness of using only hypervariable region tags for assessing microbial community membership, we compared the taxonomy assigned to the V3 and V6 hypervariable regions with the taxonomy assigned to full-length SSU rRNA sequences isolated from both the human gut and a deep-sea hydrothermal vent. The hypervariable region tags and full-length rRNA sequences provided equivalent taxonomy and measures of relative abundance of microbial communities, even for tags up to 15\% divergent from their nearest reference match. The greater sampling depth per dollar afforded by massively parallel pyrosequencing reveals many more members of the "rare biosphere" than does capillary sequencing of the full-length gene. In addition, tag sequencing eliminates cloning bias and the sequences are short enough to be completely sequenced in a single read, maximizing the number of organisms sampled in a run while minimizing chimera formation. This technique allows the cost-effective exploration of changes in microbial community structure, including the rare biosphere, over space and time and can be applied immediately to initiatives, such as the Human Microbiome Project.},
  timestamp = {2016-06-16T16:07:42Z},
  number = {11},
  journaltitle = {PLoS Genetics},
  author = {Huse, Susan M. and Dethlefsen, Les and Huber, Julie A. and Mark Welch, David and Relman, David A. and Sogin, Mitchell L.},
  date = {2008-11},
  pages = {e1000255},
  keywords = {Bacteria,Bacteria: classification,Bacteria: genetics,Biodiversity,Classification,Classification: methods,DNA,Humans,Metagenome,Metagenome: genetics,Ribosomal,Ribosomal: genetics,RNA,Sequence Analysis,Sequence Tagged Sites},
  eprinttype = {pubmed},
  eprint = {19023400}
}

@article{HusonIntegrative2011,
  title = {Integrative Analysis of Environmental Sequences Using {{MEGAN4}}},
  volume = {21},
  issn = {1549-5469},
  doi = {10.1101/gr.120618.111},
  abstract = {A major challenge in the analysis of environmental sequences is data integration. The question is how to analyze different types of data in a unified approach, addressing both the taxonomic and functional aspects. To facilitate such analyses, we have substantially extended MEGAN, a widely used taxonomic analysis program. The new program, MEGAN4, provides an integrated approach to the taxonomic and functional analysis of metagenomic, metatranscriptomic, metaproteomic, and rRNA data. While taxonomic analysis is performed based on the NCBI taxonomy, functional analysis is performed using the SEED classification of subsystems and functional roles or the KEGG classification of pathways and enzymes. A number of examples illustrate how such analyses can be performed, and show that one can also import and compare classification results obtained using others' tools. MEGAN4 is freely available for academic purposes, and installers for all three major operating systems can be downloaded from www-ab.informatik.uni-tuebingen.de/software/megan.},
  timestamp = {2016-06-16T16:07:43Z},
  number = {9},
  journaltitle = {Genome Research},
  author = {Huson, Daniel H. and Mitra, Suparna and Ruscheweyh, Hans-Joachim and Weber, Nico and Schuster, Stephan C.},
  date = {2011-09},
  pages = {1552--1560},
  keywords = {16S,16S: genetics,Classification,Metagenome,Metagenome: genetics,metagenomics,Proteomics,Ribosomal,RNA,Software,Transcriptome},
  eprinttype = {pubmed},
  eprint = {21690186}
}

@article{HusonPoor2014,
  title = {A Poor Man's {{BLASTX}}—High-Throughput Metagenomic Protein Database Search Using {{PAUDA}}},
  volume = {30},
  issn = {1367-4811},
  doi = {10.1093/bioinformatics/btt254},
  abstract = {SUMMARY: In the context of metagenomics, we introduce a new approach to protein database search called PAUDA, which runs ∼10,000 times faster than BLASTX, while achieving about one-third of the assignment rate of reads to KEGG orthology groups, and producing gene and taxon abundance profiles that are highly correlated to those obtained with BLASTX. PAUDA requires \textless{}80 CPU hours to analyze a dataset of 246 million Illumina DNA reads from permafrost soil for which a previous BLASTX analysis (on a subset of 176 million reads) reportedly required 800,000 CPU hours, leading to the same clustering of samples by functional profiles. AVAILABILITY: PAUDA is freely available from: http://ab.inf.uni-tuebingen.de/software/pauda. Also supplementary method details are available from this website.},
  timestamp = {2016-06-16T16:07:43Z},
  number = {1},
  journaltitle = {Bioinformatics},
  author = {Huson, Daniel H. and Xie, Chao},
  date = {2014-01},
  pages = {38--39},
  keywords = {Algorithms,Amino Acid Sequence,Base Sequence,Databases,DNA,metagenomics,Metagenomics: methods,Protein,Sequence Analysis,Software},
  eprinttype = {pubmed},
  eprint = {23658416}
}

@article{ImelfortGroopm2014,
  author = {Imelfort, Michael and Parks, Donovan and Woodcroft, Ben J. and Dennis, Paul and Hugenholtz, Philip and Tyson, Gene W.},
  title = {{{GroopM}}: An Automated Tool for the Recovery of Population Genomes from Related Metagenomes},
  journaltitle = {PeerJ},
  date = {2014-09},
  volume = {2},
  pages = {e603},
  issn = {2167-8359},
  doi = {10.7717/peerj.603},
  keywords = {bioinformatics,metagenomics,microbial ecology,population genome binning},
  timestamp = {2016-06-16T16:07:43Z}
}

@article{IversonUntangling2012,
  author = {Iverson, V. and Morris, R. M. and Frazar, C. D. and Berthiaume, C. T. and Morales, R. L. and Armbrust, E. V.},
  title = {Untangling Genomes from Metagenomes: Revealing an Uncultured Class of Marine {{Euryarchaeota}}},
  journaltitle = {Science},
  date = {2012-02},
  volume = {335},
  number = {6068},
  pages = {587--590},
  issn = {0036-8075},
  doi = {10.1126/science.1212665},
  timestamp = {2016-06-16T16:07:43Z}
}

@incollection{JeewonDetection2007,
  title = {Detection and Diversity of Fungi from Environmental Samples: Traditional versus Molecular Approaches},
  booktitle = {Advanced Techniques in Soil Microbiology},
  editor = {Varma, Ajit and Oelmüller, Ralf},
  series = {Soil Biology},
  number = {11},
  publisher = {Springer},
  location = {Berlin, Heidelberg},
  timestamp = {2016-06-16T16:07:44Z},
  author = {Jeewon, Rajesh and Hyde, Kevin D.},
  date = {2007},
  pages = {1--15}
}

@article{KarlinDinucleotide1995,
  title = {Dinucleotide Relative Abundance Extremes: A Genomic Signature},
  volume = {11},
  issn = {0168-9525},
  abstract = {Early biochemical experiments established that the set of dinucleotide odds ratios or 'general design' is a remarkably stable property of the DNA of an organism, which is essentially the same in protein-coding DNA, bulk genomic DNA, and in different renaturation rate and density gradient fractions of genomic DNA in many organisms. Analysis of currently available genomic sequence data has extended these earlier results, showing that the general designs of disjoint samples of a genome are substantially more similar to each other than to those of sequences from other organisms and that closely related organisms have similar general designs. From this perspective, the set of dinucleotide odds ratio (relative abundance) values constitute a signature of each DNA genome, which can discriminate between sequences from different organisms. Dinucleotide-odds ratio values appear to reflect not only the chemistry of dinucleotide stacking energies and base-step conformational preferences, but also the species-specific properties of DNA modification, replication and repair mechanisms.},
  timestamp = {2016-06-16T16:07:44Z},
  number = {7},
  journaltitle = {Trends in Genetics},
  author = {Karlin, S. and Burge, C.},
  date = {1995-07},
  pages = {283--290},
  keywords = {Animals,CpG Islands,Dinucleotide Repeats,DNA,DNA: genetics,Genome},
  eprinttype = {pubmed},
  eprint = {7482779}
}

@article{KislyukUnsupervised2009,
  title = {Unsupervised Statistical Clustering of Environmental Shotgun Sequences},
  volume = {10},
  issn = {1471-2105},
  doi = {10.1186/1471-2105-10-316},
  abstract = {The development of effective environmental shotgun sequence binning methods remains an ongoing challenge in algorithmic analysis of metagenomic data. While previous methods have focused primarily on supervised learning involving extrinsic data, a first-principles statistical model combined with a self-training fitting method has not yet been developed.},
  timestamp = {2017-02-17T20:47:53Z},
  journaltitle = {BMC Bioinformatics},
  author = {Kislyuk, Andrey and Bhatnagar, Srijak and Dushoff, Jonathan and Weitz, Joshua S.},
  date = {2009-01},
  pages = {316},
  keywords = {Algorithms,Cluster Analysis,DNA,DNA: methods,Genome,Genomics,Genomics: methods,metagenomics,Sequence Analysis,taxonomic binning,unsupervised taxonomic binning},
  eprinttype = {pubmed},
  eprint = {19799776}
}

@article{KlumppNext2012,
  title = {Next Generation Sequencing Technologies and the Changing Landscape of Phage Genomics},
  volume = {2},
  issn = {2159-7073},
  doi = {10.4161/bact.22111},
  abstract = {The dawn of next generation sequencing technologies has opened up exciting possibilities for whole genome sequencing of a plethora of organisms. The 2nd and 3rd generation sequencing technologies, based on cloning-free, massively parallel sequencing, have enabled the generation of a deluge of genomic sequences of both prokaryotic and eukaryotic origin in the last seven years. However, whole genome sequencing of bacterial viruses has not kept pace with this revolution, despite the fact that their genomes are orders of magnitude smaller in size compared with bacteria and other organisms. Sequencing phage genomes poses several challenges; (1) obtaining pure phage genomic material, (2) PCR amplification biases and (3) complex nature of their genetic material due to features such as methylated bases and repeats that are inherently difficult to sequence and assemble. Here we describe conclusions drawn from our efforts in sequencing hundreds of bacteriophage genomes from a variety of Gram-positive and Gram-negative bacteria using Sanger, 454, Illumina and PacBio technologies. Based on our experience we propose several general considerations regarding sample quality, the choice of technology and a "blended approach" for generating reliable whole genome sequences of phages.},
  timestamp = {2016-06-16T16:07:47Z},
  number = {3},
  journaltitle = {Bacteriophage},
  author = {Klumpp, Jochen and Fouts, Derrick E. and Sozhamannan, Shanmuga},
  date = {2012-07},
  pages = {190--199},
  keywords = {assembly,bacteriophage genome,hybrid genome,illumina hiseq,PacBio,roche 454,sanger sequencing,scaffolding,sispa},
  eprinttype = {pubmed},
  eprint = {23275870}
}

@article{KorenBambus2011,
  author = {Koren, S. and Treangen, T. J. and Pop, M.},
  title = {Bambus 2: {{Scaffolding}} Metagenomes},
  journaltitle = {Bioinformatics},
  date = {2011-09},
  volume = {27},
  number = {21},
  pages = {2964--2971},
  issn = {1367-4803},
  doi = {10.1093/bioinformatics/btr520},
  timestamp = {2016-06-16T16:07:47Z}
}

@article{KoslickiQuikr2013,
  title = {Quikr: A Method for Rapid Reconstruction of Bacterial Communities via Compressive Sensing},
  volume = {29},
  issn = {1367-4811},
  doi = {10.1093/bioinformatics/btt336},
  abstract = {MOTIVATION: Many metagenomic studies compare hundreds to thousands of environmental and health-related samples by extracting and sequencing their 16S rRNA amplicons and measuring their similarity using beta-diversity metrics. However, one of the first steps–to classify the operational taxonomic units within the sample–can be a computationally time-consuming task because most methods rely on computing the taxonomic assignment of each individual read out of tens to hundreds of thousands of reads. RESULTS: We introduce Quikr: a QUadratic, K-mer-based, Iterative, Reconstruction method, which computes a vector of taxonomic assignments and their proportions in the sample using an optimization technique motivated from the mathematical theory of compressive sensing. On both simulated and actual biological data, we demonstrate that Quikr typically has less error and is typically orders of magnitude faster than the most commonly used taxonomic assignment technique (the Ribosomal Database Project's Naïve Bayesian Classifier). Furthermore, the technique is shown to be unaffected by the presence of chimeras, thereby allowing for the circumvention of the time-intensive step of chimera filtering. AVAILABILITY: The Quikr computational package (in MATLAB, Octave, Python and C) for the Linux and Mac platforms is available at http://sourceforge.net/projects/quikr/.},
  timestamp = {2016-06-16T16:07:48Z},
  number = {17},
  journaltitle = {Bioinformatics},
  author = {Koslicki, David and Foucart, Simon and Rosen, Gail},
  date = {2013-09},
  pages = {2096--2102},
  keywords = {16S,16S: genetics,Algorithms,Bacteria,Bacteria: classification,Bacteria: genetics,Bacteria: isolation \& purification,Bayes Theorem,Classification,Classification: methods,DNA,DNA: methods,metagenomics,Microbiota,Phylogeny,Ribosomal,RNA,Sequence Analysis,Software},
  eprinttype = {pmid},
  eprint = {23786768}
}

@article{LasersonGenovo2011,
  title = {Genovo: De Novo Assembly for Metagenomes},
  volume = {18},
  issn = {1557-8666},
  doi = {10.1089/cmb.2010.0244},
  abstract = {Next-generation sequencing technologies produce a large number of noisy reads from the DNA in a sample. Metagenomics and population sequencing aim to recover the genomic sequences of the species in the sample, which could be of high diversity. Methods geared towards single sequence reconstruction are not sensitive enough when applied in this setting. We introduce a generative probabilistic model of read generation from environmental samples and present Genovo, a novel de novo sequence assembler that discovers likely sequence reconstructions under the model. A nonparametric prior accounts for the unknown number of genomes in the sample. Inference is performed by applying a series of hill-climbing steps iteratively until convergence. We compare the performance of Genovo to three other short read assembly programs in a series of synthetic experiments and across nine metagenomic datasets created using the 454 platform, the largest of which has 311k reads. Genovo's reconstructions cover more bases and recover more genes than the other methods, even for low-abundance sequences, and yield a higher assembly score. Supplementary Material is available at www.liebertonline.com/cmb.},
  timestamp = {2016-06-16T16:07:51Z},
  number = {3},
  journaltitle = {Journal of Computational Biology},
  author = {Laserson, Jonathan and Jojic, Vladimir and Koller, Daphne},
  date = {2011-03},
  pages = {429--443},
  keywords = {Algorithms,Animals,DNA,DNA: methods,Humans,Metagenome,metagenomics,Metagenomics: methods,Models,Sequence Analysis,Statistical},
  eprinttype = {pmid},
  eprint = {21385045}
}

@article{LindnerMetagenomic2013,
  title = {Metagenomic Abundance Estimation and Diagnostic Testing on Species Level},
  volume = {41},
  issn = {1362-4962},
  doi = {10.1093/nar/gks803},
  abstract = {One goal of sequencing-based metagenomic community analysis is the quantitative taxonomic assessment of microbial community compositions. In particular, relative quantification of taxons is of high relevance for metagenomic diagnostics or microbial community comparison. However, the majority of existing approaches quantify at low resolution (e.g. at phylum level), rely on the existence of special genes (e.g. 16S), or have severe problems discerning species with highly similar genome sequences. Yet, problems as metagenomic diagnostics require accurate quantification on species level. We developed Genome Abundance Similarity Correction (GASiC), a method to estimate true genome abundances via read alignment by considering reference genome similarities in a non-negative LASSO approach. We demonstrate GASiC's superior performance over existing methods on simulated benchmark data as well as on real data. In addition, we present applications to datasets of both bacterial DNA and viral RNA source. We further discuss our approach as an alternative to PCR-based DNA quantification.},
  timestamp = {2016-06-16T16:07:51Z},
  number = {1},
  journaltitle = {Nucleic Acids Research},
  author = {Lindner, Martin S and Renard, Bernhard Y},
  date = {2013-01},
  pages = {e10},
  keywords = {Algorithms,Bacterial,Bacterial: analysis,Bacterial: chemistry,Classification,Classification: methods,DNA,Escherichia coli,Escherichia coli: genetics,metagenomics,Metagenomics: methods,RNA,Sequence Alignment,Viral,Viral: analysis,Viral: chemistry},
  eprinttype = {pmid},
  eprint = {22941661}
}

@article{LucksGenome2008,
  title = {Genome Landscapes and Bacteriophage Codon Usage},
  volume = {4},
  issn = {1553-7358},
  doi = {10.1371/journal.pcbi.1000001},
  abstract = {Across all kingdoms of biological life, protein-coding genes exhibit unequal usage of synonymous codons. Although alternative theories abound, translational selection has been accepted as an important mechanism that shapes the patterns of codon usage in prokaryotes and simple eukaryotes. Here we analyze patterns of codon usage across 74 diverse bacteriophages that infect E. coli, P. aeruginosa, and L. lactis as their primary host. We use the concept of a "genome landscape," which helps reveal non-trivial, long-range patterns in codon usage across a genome. We develop a series of randomization tests that allow us to interrogate the significance of one aspect of codon usage, such as GC content, while controlling for another aspect, such as adaptation to host-preferred codons. We find that 33 phage genomes exhibit highly non-random patterns in their GC3-content, use of host-preferred codons, or both. We show that the head and tail proteins of these phages exhibit significant bias towards host-preferred codons, relative to the non-structural phage proteins. Our results support the hypothesis of translational selection on viral genes for host-preferred codons, over a broad range of bacteriophages.},
  timestamp = {2016-06-16T16:07:53Z},
  number = {2},
  journaltitle = {PLoS Computational Biology},
  author = {Lucks, Julius B and Nelson, David R and Kudla, Grzegorz R and Plotkin, Joshua B},
  date = {2008-02},
  pages = {e1000001},
  keywords = {Bacteriophages,Bacteriophages: genetics,Biological Evolution,Chromosome Mapping,Chromosome Mapping: methods,Computer Simulation,Evolution,Genes,Genetic,Genetic Variation,Genetic Variation: genetics,Genome,Models,Molecular,Species Specificity,Viral,Viral: genetics},
  eprinttype = {pmid},
  eprint = {18463708}
}

@article{LuoSoapdenovo22012,
  title = {{{SOAPdenovo2}}: An Empirically Improved Memory-Efficient Short-Read de Novo Assembler},
  volume = {1},
  issn = {2047-217X},
  doi = {10.1186/2047-217X-1-18},
  abstract = {BACKGROUND: There is a rapidly increasing amount of de novo genome assembly using next-generation sequencing (NGS) short reads; however, several big challenges remain to be overcome in order for this to be efficient and accurate. SOAPdenovo has been successfully applied to assemble many published genomes, but it still needs improvement in continuity, accuracy and coverage, especially in repeat regions. FINDINGS: To overcome these challenges, we have developed its successor, SOAPdenovo2, which has the advantage of a new algorithm design that reduces memory consumption in graph construction, resolves more repeat regions in contig assembly, increases coverage and length in scaffold construction, improves gap closing, and optimizes for large genome. CONCLUSIONS: Benchmark using the Assemblathon1 and GAGE datasets showed that SOAPdenovo2 greatly surpasses its predecessor SOAPdenovo and is competitive to other assemblers on both assembly length and accuracy. We also provide an updated assembly version of the 2008 Asian (YH) genome using SOAPdenovo2. Here, the contig and scaffold N50 of the YH genome were ∼20.9 kbp and ∼22 Mbp, respectively, which is 3-fold and 50-fold longer than the first published version. The genome coverage increased from 81.16\% to 93.91\%, and memory consumption was ∼2/3 lower during the point of largest memory consumption.},
  timestamp = {2016-06-16T16:07:53Z},
  number = {1},
  journaltitle = {GigaScience},
  author = {Luo, Ruibang and Liu, Binghang and Xie, Yinlong and Li, Zhenyu and Huang, Weihua and Yuan, Jianying and He, Guangzhu and Chen, Yanxiang and Pan, Qi and Liu, Yunjie and Tang, Jingbo and Wu, Gengxiong and Zhang, Hao and Shi, Yujian and Liu, Yong and Yu, Chang and Wang, Bo and Lu, Yao and Han, Changlei and Cheung, David W and Yiu, Siu-Ming and Peng, Shaoliang and Xiaoqian, Zhu and Liu, Guangming and Liao, Xiangke and Li, Yingrui and Yang, Huanming and Wang, Jian and Lam, Tak-Wah and Wang, Jun},
  date = {2012-01},
  pages = {18},
  keywords = {assembly,contig,error correction,gap-filling,Genome,scaffold},
  eprinttype = {pmid},
  eprint = {23587118}
}

@article{MackelprangMetagenomic2011,
  title = {Metagenomic Analysis of a Permafrost Microbial Community Reveals a Rapid Response to Thaw},
  volume = {480},
  issn = {0028-0836},
  doi = {10.1038/nature10576},
  timestamp = {2016-06-16T16:07:56Z},
  number = {7377},
  journaltitle = {Nature},
  author = {Mackelprang, Rachel and Waldrop, Mark P. and DeAngelis, Kristen M. and David, Maude M. and Chavarria, Krystle L. and Blazewicz, Steven J. and Rubin, Edward M. and Jansson, Janet K.},
  date = {2011-11},
  pages = {368--371}
}

@article{MardisImpact2008,
  title = {The Impact of Next-Generation Sequencing Technology on Genetics},
  volume = {24},
  issn = {0168-9525},
  doi = {10.1016/j.tig.2007.12.007},
  abstract = {If one accepts that the fundamental pursuit of genetics is to determine the genotypes that explain phenotypes, the meteoric increase of DNA sequence information applied toward that pursuit has nowhere to go but up. The recent introduction of instruments capable of producing millions of DNA sequence reads in a single run is rapidly changing the landscape of genetics, providing the ability to answer questions with heretofore unimaginable speed. These technologies will provide an inexpensive, genome-wide sequence readout as an endpoint to applications ranging from chromatin immunoprecipitation, mutation mapping and polymorphism discovery to noncoding RNA discovery. Here I survey next-generation sequencing technologies and consider how they can provide a more complete picture of how the genome shapes the organism.},
  timestamp = {2016-06-16T16:07:57Z},
  number = {3},
  journaltitle = {Trends in Genetics},
  author = {Mardis, Elaine R},
  date = {2008-03},
  pages = {133--141},
  keywords = {Animals,DNA,Forecasting,Genetics,Genetics: trends,Humans,Sequence Analysis},
  eprinttype = {pmid},
  eprint = {18262675}
}

@article{MatsenPplacer2010,
  title = {Pplacer: Linear Time Maximum-Likelihood and {{Bayesian}} Phylogenetic Placement of Sequences onto a Fixed Reference Tree},
  volume = {11},
  issn = {1471-2105},
  doi = {10.1186/1471-2105-11-538},
  abstract = {Likelihood-based phylogenetic inference is generally considered to be the most reliable classification method for unknown sequences. However, traditional likelihood-based phylogenetic methods cannot be applied to large volumes of short reads from next-generation sequencing due to computational complexity issues and lack of phylogenetic signal. "Phylogenetic placement," where a reference tree is fixed and the unknown query sequences are placed onto the tree via a reference alignment, is a way to bring the inferential power offered by likelihood-based approaches to large data sets.},
  timestamp = {2016-06-16T16:07:58Z},
  number = {1},
  journaltitle = {BMC Bioinformatics},
  author = {Matsen, Frederick A and Kodner, Robin B and Armbrust, E Virginia},
  date = {2010-01},
  pages = {538},
  keywords = {Base Sequence,Bayes Theorem,Computational Biology,Computational Biology: methods,Likelihood Functions,metagenomics,phylogenetics,Phylogeny,Sequence Alignment,Sequence Alignment: methods,Software},
  eprinttype = {pmid},
  eprint = {21034504}
}

@article{MavromatisUse2007,
  title = {Use of Simulated Data Sets to Evaluate the Fidelity of Metagenomic Processing Methods},
  volume = {4},
  issn = {1548-7091},
  doi = {10.1038/nmeth1043},
  abstract = {Metagenomics is a rapidly emerging field of research for studying microbial communities. To evaluate methods presently used to process metagenomic sequences, we constructed three simulated data sets of varying complexity by combining sequencing reads randomly selected from 113 isolate genomes. These data sets were designed to model real metagenomes in terms of complexity and phylogenetic composition. We assembled sampled reads using three commonly used genome assemblers (Phrap, Arachne and JAZZ), and predicted genes using two popular gene-finding pipelines (fgenesb and CRITICA/GLIMMER). The phylogenetic origins of the assembled contigs were predicted using one sequence similarity-based (blast hit distribution) and two sequence composition-based (PhyloPythia, oligonucleotide frequencies) binning methods. We explored the effects of the simulated community structure and method combinations on the fidelity of each processing step by comparison to the corresponding isolate genomes. The simulated data sets are available online to facilitate standardized benchmarking of tools for metagenomic analysis.},
  timestamp = {2016-06-16T16:07:59Z},
  number = {6},
  journaltitle = {Nature Methods},
  author = {Mavromatis, Konstantinos and Ivanova, Natalia and Barry, Kerrie and Shapiro, Harris and Goltsman, Eugene and McHardy, Alice Carolyn and Rigoutsos, Isidore and Salamov, Asaf and Korzeniewski, Frank and Land, Miriam and Lapidus, Alla and Grigoriev, Igor and Richardson, Paul and Hugenholtz, Philip and Kyrpides, Nikos C},
  date = {2007-06},
  pages = {495--500},
  keywords = {Bacterial,Bacterial: genetics,Cluster Analysis,Computational Biology,Computational Biology: methods,Computational Biology: standards,Computer Simulation,Databases,Genetic,Genome,Genomics,Genomics: methods,Genomics: standards,Phylogeny,Software},
  eprinttype = {pmid},
  eprint = {17468765}
}

@article{MchardyAccurate2007,
  title = {Accurate Phylogenetic Classification of Variable-Length {{DNA}} Fragments},
  volume = {4},
  issn = {1548-7091},
  doi = {10.1038/nmeth976},
  abstract = {Metagenome studies have retrieved vast amounts of sequence data from a variety of environments leading to new discoveries and insights into the uncultured microbial world. Except for very simple communities, the encountered diversity has made fragment assembly and the subsequent analysis a challenging problem. A taxonomic characterization of metagenomic fragments is required for a deeper understanding of shotgun-sequenced microbial communities, but success has mostly been limited to sequences containing phylogenetic marker genes. Here we present PhyloPythia, a composition-based classifier that combines higher-level generic clades from a set of 340 completed genomes with sample-derived population models. Extensive analyses on synthetic and real metagenome data sets showed that PhyloPythia allows the accurate classification of most sequence fragments across all considered taxonomic ranks, even for unknown organisms. The method requires no more than 100 kb of training sequence for the creation of accurate models of sample-specific populations and can assign fragments $\geq$1 kb with high specificity.},
  timestamp = {2016-06-16T16:07:59Z},
  number = {1},
  journaltitle = {Nature Methods},
  author = {McHardy, Alice Carolyn and Martín, Héctor García and Tsirigos, Aristotelis and Hugenholtz, Philip and Rigoutsos, Isidore},
  date = {2007-01},
  pages = {63--72},
  keywords = {Animals,Archaea,Archaea: genetics,Arthropods,Arthropods: genetics,Ascomycota,Ascomycota: genetics,Bacteria,Bacteria: genetics,Chordata,Chordata: genetics,DNA,DNA: chemistry,DNA: classification,DNA: genetics,Eukaryotic Cells,Genome,Genomics,Genomics: methods,Industrial Waste,Phylogeny,Sargassum,Sargassum: microbiology,Software Validation},
  eprinttype = {pmid},
  eprint = {17179938}
}

@article{MetzkerSequencing2009,
  title = {Sequencing Technologies — the Next Generation},
  volume = {11},
  issn = {1471-0056},
  doi = {10.1038/nrg2626},
  timestamp = {2016-06-16T16:08:00Z},
  number = {1},
  journaltitle = {Nature Reviews Genetics},
  author = {Metzker, Michael L.},
  date = {2009-12},
  pages = {31--46}
}

@article{MeyerMetagenomics2008,
  title = {The Metagenomics {{RAST}} Server -- a Public Resource for the Automatic Phylogenetic and Functional Analysis of Metagenomes},
  volume = {9},
  issn = {1471-2105},
  doi = {10.1186/1471-2105-9-386},
  abstract = {BACKGROUND: Random community genomes (metagenomes) are now commonly used to study microbes in different environments. Over the past few years, the major challenge associated with metagenomics shifted from generating to analyzing sequences. High-throughput, low-cost next-generation sequencing has provided access to metagenomics to a wide range of researchers. RESULTS: A high-throughput pipeline has been constructed to provide high-performance computing to all researchers interested in using metagenomics. The pipeline produces automated functional assignments of sequences in the metagenome by comparing both protein and nucleotide databases. Phylogenetic and functional summaries of the metagenomes are generated, and tools for comparative metagenomics are incorporated into the standard views. User access is controlled to ensure data privacy, but the collaborative environment underpinning the service provides a framework for sharing datasets between multiple users. In the metagenomics RAST, all users retain full control of their data, and everything is available for download in a variety of formats. CONCLUSION: The open-source metagenomics RAST service provides a new paradigm for the annotation and analysis of metagenomes. With built-in support for multiple data sources and a back end that houses abstract data types, the metagenomics RAST is stable, extensible, and freely available to all researchers. This service has removed one of the primary bottlenecks in metagenome sequence analysis - the availability of high-performance computing for annotating the data. http://metagenomics.nmpdr.org.},
  timestamp = {2016-06-16T16:08:02Z},
  journaltitle = {BMC Bioinformatics},
  author = {Meyer, Folker and Paarmann, D and D'Souza, M and Olson, R and Glass, E M and Kubal, M and Paczian, T and Rodriguez, A and Stevens, R and Wilke, A and Wilkening, J and Edwards, R A},
  date = {2008-01},
  pages = {386},
  keywords = {Algorithms,Database Management Systems,Databases,Genetic,Information Storage and Retrieval,Information Storage and Retrieval: methods,Internet,Phylogeny,Proteome,Proteome: genetics,Software,User-Computer Interface},
  eprinttype = {pmid},
  eprint = {18803844}
}

@article{MillerAssembly2010,
  title = {Assembly Algorithms for Next-Generation Sequencing Data},
  volume = {95},
  issn = {1089-8646},
  doi = {10.1016/j.ygeno.2010.03.001},
  abstract = {The emergence of next-generation sequencing platforms led to resurgence of research in whole-genome shotgun assembly algorithms and software. DNA sequencing data from the Roche 454, Illumina/Solexa, and ABI SOLiD platforms typically present shorter read lengths, higher coverage, and different error profiles compared with Sanger sequencing data. Since 2005, several assembly software packages have been created or revised specifically for de novo assembly of next-generation sequencing data. This review summarizes and compares the published descriptions of packages named SSAKE, SHARCGS, VCAKE, Newbler, Celera Assembler, Euler, Velvet, ABySS, AllPaths, and SOAPdenovo. More generally, it compares the two standard methods known as the de Bruijn graph approach and the overlap/layout/consensus approach to assembly.},
  timestamp = {2016-06-16T16:08:02Z},
  number = {6},
  journaltitle = {Genomics},
  author = {Miller, Jason R and Koren, Sergey and Sutton, Granger},
  date = {2010-06},
  pages = {315--327},
  keywords = {Algorithms,DNA,DNA: methods,DNA: trends,Forecasting,Sequence Analysis,Software},
  eprinttype = {pmid},
  eprint = {20211242}
}

@article{MirarabSepp2012,
  title = {{{SEPP}}: {{SATé}}-{{Enabled Phylogenetic Placement}}},
  issn = {1793-5091},
  abstract = {We address the problem of Phylogenetic Placement, in which the objective is to insert short molecular sequences (called query sequences) into an existing phylogenetic tree and alignment on full-length sequences for the same gene. Phylogenetic placement has the potential to provide information beyond pure "species identification" (i.e., the association of metagenomic reads to existing species), because it can also give information about the evolutionary relationships between these query sequences and to known species. Approaches for phylogenetic placement have been developed that operate in two steps: first, an alignment is estimated for each query sequence to the alignment of the full-length sequences, and then that alignment is used to find the optimal location in the phylogenetic tree for the query sequence. Recent methods of this type include HMMALIGN+EPA, HMMALIGN+pplacer, and PaPaRa+EPA. We report on a study evaluating phylogenetic placement methods on biological and simulated data. This study shows that these methods have extremely good accuracy and computational tractability under conditions where the input contains a highly accurate alignment and tree for the full-length sequences, and the set of full-length sequences is sufficiently small and not too evolutionarily diverse; however, we also show that under other conditions accuracy declines and the computational requirements for memory and time exceed acceptable limits. We present SEPP, a general "boosting" technique to improve the accuracy and/or speed of phylogenetic placement techniques. The key algorithmic aspect of this booster is a dataset decomposition technique in SATé, a method that utilizes an iterative divide-and-conquer technique to co-estimate alignments and trees on large molecular sequence datasets. We show that SATé-boosting improves HMMALIGN+pplacer, placing short sequences more accurately when the set of input sequences has a large evolutionary diameter and produces placements of comparable accuracy in a fraction of the time for easier cases. SEPP software and the datasets used in this study are all available for free at http://www.cs.utexas.edu/users/phylo/software/sepp/submission.},
  timestamp = {2016-06-16T16:08:02Z},
  journaltitle = {Pacific Symposium on Biocomputing},
  author = {Mirarab, S and Nguyen, N and Warnow, T},
  date = {2012-01},
  pages = {247--258},
  keywords = {metagenomic analysis,phylogenetic placement},
  eprinttype = {pmid},
  eprint = {22174280}
}

@article{MonzoorulhaqueSortitems2009,
  title = {{{SOrt}}-{{ITEMS}}: {{Sequence}} Orthology Based Approach for Improved Taxonomic Estimation of Metagenomic Sequences},
  volume = {25},
  issn = {1367-4811},
  doi = {10.1093/bioinformatics/btp317},
  abstract = {MOTIVATION: One of the first steps in metagenomic analysis is the assignment of reads/contigs obtained from various sequencing technologies to their correct taxonomic bins. Similarity-based binning methods assign a read to a taxon/clade, based on the pattern of significant BLAST hits generated against sequence databases. Existing methods, which use bit-score as the sole parameter to ascertain the significance of BLAST hits, have limited specificity and accuracy of binning. A new binning algorithm, called SOrt-ITEMS is introduced, which addresses these limitations. The method uses alignment parameters besides the bit score to first identify an appropriate taxonomic level where the read can be assigned. An orthology-based approach is subsequently used by the method for the final assignment. RESULTS: The performance of SOrt-ITEMS has been validated with reads simulating sequences from 454 and Sanger sequencing technologies. In addition, the taxonomic composition of the Sargasso Sea data set has been analyzed using SOrt-ITEMS. SOrt-ITEMS shows improved specificity and accuracy of assignments especially in simulated scenarios, wherein sequences corresponding to the source organism of the reads are absent in the reference database. AVAILABILITY: SOrt-ITEMS software is available for download from: http://metagenomics.atc.tcs.com/binning/SOrt-ITEMS. No license is needed for academic and nonprofit use.},
  timestamp = {2016-06-16T16:08:03Z},
  number = {14},
  journaltitle = {Bioinformatics},
  author = {Monzoorul Haque, M and Ghosh, Tarini Shankar and Komanduri, Dinakar and Mande, Sharmila S},
  date = {2009-07},
  pages = {1722--1730},
  keywords = {Algorithms,Base Sequence,Classification,Classification: methods,Databases,Genetic,Genomics,Genomics: methods,Phylogeny,Sequence Alignment,Software},
  eprinttype = {pmid},
  eprint = {19439565}
}

@article{NelsonCatalog2010,
  title = {A Catalog of Reference Genomes from the Human Microbiome},
  volume = {328},
  issn = {1095-9203},
  doi = {10.1126/science.1183605},
  abstract = {The human microbiome refers to the community of microorganisms, including prokaryotes, viruses, and microbial eukaryotes, that populate the human body. The National Institutes of Health launched an initiative that focuses on describing the diversity of microbial species that are associated with health and disease. The first phase of this initiative includes the sequencing of hundreds of microbial reference genomes, coupled to metagenomic sequencing from multiple body sites. Here we present results from an initial reference genome sequencing of 178 microbial genomes. From 547,968 predicted polypeptides that correspond to the gene complement of these strains, previously unidentified ("novel") polypeptides that had both unmasked sequence length greater than 100 amino acids and no BLASTP match to any nonreference entry in the nonredundant subset were defined. This analysis resulted in a set of 30,867 polypeptides, of which 29,987 (approximately 97\%) were unique. In addition, this set of microbial genomes allows for approximately 40\% of random sequences from the microbiome of the gastrointestinal tract to be associated with organisms based on the match criteria used. Insights into pan-genome analysis suggest that we are still far from saturating microbial species genetic data sets. In addition, the associated metrics and standards used by our group for quality assurance are presented.},
  timestamp = {2016-06-16T16:08:06Z},
  number = {5981},
  journaltitle = {Science},
  author = {Nelson, Karen E and Weinstock, George M and Highlander, Sarah K and Worley, Kim C and Creasy, Heather Huot and Wortman, Jennifer Russo and Rusch, Douglas B and Mitreva, Makedonka and Sodergren, Erica and Chinwalla, Asif T and Feldgarden, Michael and Gevers, Dirk and Haas, Brian J and Madupu, Ramana and Ward, Doyle V and Birren, Bruce W and Gibbs, Richard A and Methe, Barbara and Petrosino, Joseph F and Strausberg, Robert L and Sutton, Granger G and White, Owen R and Wilson, Richard K and Durkin, Scott and Giglio, Michelle Gwinn and Gujja, Sharvari and Howarth, Clint and Kodira, Chinnappa D and Kyrpides, Nikos and Mehta, Teena and Muzny, Donna M and Pearson, Matthew and Pepin, Kymberlie and Pati, Amrita and Qin, Xiang and Yandava, Chandri and Zeng, Qiandong and Zhang, Lan and Berlin, Aaron M and Chen, Lei and Hepburn, Theresa A and Johnson, Justin and McCorrison, Jamison and Miller, Jason and Minx, Pat and Nusbaum, Chad and Russ, Carsten and Sykes, Sean M and Tomlinson, Chad M and Young, Sarah and Warren, Wesley C and Badger, Jonathan and Crabtree, Jonathan and Markowitz, Victor M and Orvis, Joshua and Cree, Andrew and Ferriera, Steve and Fulton, Lucinda L and Fulton, Robert S and Gillis, Marcus and Hemphill, Lisa D and Joshi, Vandita and Kovar, Christie and Torralba, Manolito and Wetterstrand, Kris A and Abouellleil, Amr and Wollam, Aye M and Buhay, Christian J and Ding, Yan and Dugan, Shannon and FitzGerald, Michael G and Holder, Mike and Hostetler, Jessica and Clifton, Sandra W and Allen-Vercoe, Emma and Earl, Ashlee M and Farmer, Candace N and Liolios, Konstantinos and Surette, Michael G and Xu, Qiang and Pohl, Craig and Wilczek-Boney, Katarzyna and Zhu, Dianhui},
  date = {2010-05},
  pages = {994--999},
  keywords = {Archaeal,Bacteria,Bacteria: classification,Bacteria: genetics,Bacterial,Bacterial Proteins,Bacterial Proteins: chemistry,Bacterial Proteins: genetics,Biodiversity,Computational Biology,Databases,DNA,DNA: standards,Gastrointestinal Tract,Gastrointestinal Tract: microbiology,Genes,Genetic,Genetic Variation,Genome,Humans,Metagenome,Metagenome: genetics,metagenomics,Metagenomics: methods,Metagenomics: standards,Mouth,Mouth: microbiology,Peptides,Peptides: chemistry,Peptides: genetics,Phylogeny,Respiratory System,Respiratory System: microbiology,Sequence Analysis,Skin,Skin: microbiology,Urogenital System,Urogenital System: microbiology},
  eprinttype = {pmid},
  eprint = {20489017}
}

@article{PatilTaxonomic2011,
  title = {Taxonomic Metagenome Sequence Assignment with Structured Output Models},
  volume = {8},
  issn = {1548-7091},
  doi = {10.1038/nmeth0311-191},
  timestamp = {2016-06-16T16:08:11Z},
  number = {3},
  journaltitle = {Nature Methods},
  author = {Patil, Kaustubh R and Haider, Peter and Pope, Phillip B and Turnbaugh, Peter J and Morrison, Mark and Scheffer, Tobias and McHardy, Alice Carolyn},
  date = {2011-03},
  pages = {191--192}
}

@article{PellScaling2012,
  title = {Scaling Metagenome Sequence Assembly with Probabilistic de {{Bruijn}} Graphs},
  volume = {109},
  doi = {10.1073/pnas.1121464109},
  abstract = {The memory requirements for de novo assembly of short-read shotgun sequencing data from complex microbial populations are an increasingly large practical barrier to environmental studies. Here we introduce a memory-efficient graph representation with which we can analyze the k-mer connectivity of metagenomic samples, allowing us to reduce the size of the de novo assembly process for metagenomes with a "divide and conquer" algorithm. This graph representation is based on a probabilistic data structure, a Bloom filter, that allows us to store assembly graphs in as little as 4 bits per k-mer. We use this approach to achieve a 20-fold decrease in memory for the assembly of a soil metagenome sample.},
  timestamp = {2017-02-21T14:57:39Z},
  number = {33},
  journaltitle = {Proceedings of the National Academy of Sciences},
  author = {Pell, Jason and Hintze, Arend and Canino-Koning, Rosangela and Howe, Adina and Tiedje, James M. and Brown, C. Titus},
  date = {2012-08},
  pages = {13272--13277},
  eprinttype = {arXiv},
  eprint = {1112.4193}
}

@article{PengMetaidba2011,
  title = {Meta-{{IDBA}}: A de Novo Assembler for Metagenomic Data},
  volume = {27},
  issn = {1367-4811},
  doi = {10.1093/bioinformatics/btr216},
  abstract = {Next-generation sequencing techniques allow us to generate reads from a microbial environment in order to analyze the microbial community. However, assembling of a set of mixed reads from different species to form contigs is a bottleneck of metagenomic research. Although there are many assemblers for assembling reads from a single genome, there are no assemblers for assembling reads in metagenomic data without reference genome sequences. Moreover, the performances of these assemblers on metagenomic data are far from satisfactory, because of the existence of common regions in the genomes of subspecies and species, which make the assembly problem much more complicated.},
  timestamp = {2016-06-16T16:08:12Z},
  number = {13},
  journaltitle = {Bioinformatics},
  author = {Peng, Yu and Leung, Henry C M and Yiu, S M and Chin, Francis Y L},
  date = {2011-07},
  pages = {i94--i101},
  eprinttype = {pmid},
  eprint = {21685107}
}

@article{PopeAdaptation2010,
  title = {Adaptation to Herbivory by the {{Tammar}} Wallaby Includes Bacterial and Glycoside Hydrolase Profiles Different from Other Herbivores},
  volume = {107},
  issn = {1091-6490},
  doi = {10.1073/pnas.1005297107},
  abstract = {Metagenomic and bioinformatic approaches were used to characterize plant biomass conversion within the foregut microbiome of Australia's "model" marsupial, the Tammar wallaby (Macropus eugenii). Like the termite hindgut and bovine rumen, key enzymes and modular structures characteristic of the "free enzyme" and "cellulosome" paradigms of cellulose solubilization remain either poorly represented or elusive to capture by shotgun sequencing methods. Instead, multigene polysaccharide utilization loci-like systems coupled with genes encoding beta-1,4-endoglucanases and beta-1,4-endoxylanases–which have not been previously encountered in metagenomic datasets–were identified, as were a diverse set of glycoside hydrolases targeting noncellulosic polysaccharides. Furthermore, both rrs gene and other phylogenetic analyses confirmed that unique clades of the Lachnospiraceae, Bacteroidales, and Gammaproteobacteria are predominant in the Tammar foregut microbiome. Nucleotide composition-based sequence binning facilitated the assemblage of more than two megabase pairs of genomic sequence for one of the novel Lachnospiraceae clades (WG-2). These analyses show that WG-2 possesses numerous glycoside hydrolases targeting noncellulosic polysaccharides. These collective data demonstrate that Australian macropods not only harbor unique bacterial lineages underpinning plant biomass conversion, but their repertoire of glycoside hydrolases is distinct from those of the microbiomes of higher termites and the bovine rumen.},
  timestamp = {2016-06-16T16:08:13Z},
  number = {33},
  journaltitle = {Proceedings of the National Academy of Sciences of the United States of America},
  author = {Pope, P B and Denman, S E and Jones, M and Tringe, S G and Barry, K and Malfatti, S A and McHardy, A C and Cheng, J-F and Hugenholtz, P and McSweeney, C S and Morrison, M},
  date = {2010-08},
  pages = {14793--14798},
  keywords = {16S,16S: genetics,Adaptation,Animals,Bacteria,Bacteria: classification,Bacteria: genetics,Bacteria: metabolism,Cellulosomes,Cellulosomes: metabolism,DNA,Gastrointestinal Tract,Gastrointestinal Tract: microbiology,Glycoside Hydrolases,Glycoside Hydrolases: classification,Glycoside Hydrolases: genetics,Glycoside Hydrolases: metabolism,Macropodidae,Macropodidae: genetics,Macropodidae: microbiology,Macropodidae: physiology,Metagenome,Metagenome: genetics,metagenomics,Metagenomics: methods,Molecular Sequence Data,Phylogeny,Physiological,Physiological: genetics,Physiological: physiology,Plants,Plants: metabolism,Ribosomal,RNA,Seasons,Sequence Analysis},
  eprinttype = {pmid},
  eprint = {20668243}
}

@article{PopeIsolation2011,
  title = {Isolation of {{Succinivibrionaceae}} Implicated in Low Methane Emissions from {{Tammar}} Wallabies},
  volume = {333},
  issn = {1095-9203},
  doi = {10.1126/science.1205760},
  abstract = {The Tammar wallaby (Macropus eugenii) harbors unique gut bacteria and produces only one-fifth the amount of methane produced by ruminants per unit of digestible energy intake. We have isolated a dominant bacterial species (WG-1) from the wallaby microbiota affiliated with the family Succinivibrionaceae and implicated in lower methane emissions from starch-containing diets. This was achieved by using a partial reconstruction of the bacterium's metabolism from binned metagenomic data (nitrogen and carbohydrate utilization pathways and antibiotic resistance) to devise cultivation-based strategies that produced axenic WG-1 cultures. Pure-culture studies confirm that the bacterium is capnophilic and produces succinate, further explaining a microbiological basis for lower methane emissions from macropodids. This knowledge also provides new strategic targets for redirecting fermentation and reducing methane production in livestock.},
  timestamp = {2016-06-16T16:08:14Z},
  number = {6042},
  journaltitle = {Science},
  author = {Pope, P B and Smith, W and Denman, S E and Tringe, S G and Barry, K and Hugenholtz, P and McSweeney, C S and McHardy, A C and Morrison, M},
  date = {2011-07},
  pages = {646--648},
  keywords = {Animals,Bacterial,Carbohydrate Metabolism,Digestive System,Digestive System: microbiology,Female,Fermentation,Genome,Macropodidae,Macropodidae: microbiology,Metagenome,Methane,Methane: metabolism,Molecular Sequence Data,Starch,Starch: metabolism,Succinic Acid,Succinic Acid: metabolism,Succinivibrionaceae,Succinivibrionaceae: genetics,Succinivibrionaceae: growth & development,Succinivibrionaceae: isolation & purification,Succinivibrionaceae: metabolism},
  eprinttype = {pmid},
  eprint = {21719642}
}

@article{PrideGenome2008,
  title = {Genome Signature Analysis of Thermal Virus Metagenomes Reveals {{Archaea}} and Thermophilic Signatures},
  volume = {9},
  issn = {1471-2164},
  doi = {10.1186/1471-2164-9-420},
  abstract = {Metagenomic analysis provides a rich source of biological information for otherwise intractable viral communities. However, study of viral metagenomes has been hampered by its nearly complete reliance on BLAST algorithms for identification of DNA sequences. We sought to develop algorithms for examination of viral metagenomes to identify the origin of sequences independent of BLAST algorithms. We chose viral metagenomes obtained from two hot springs, Bear Paw and Octopus, in Yellowstone National Park, as they represent simple microbial populations where comparatively large contigs were obtained. Thermal spring metagenomes have high proportions of sequences without significant Genbank homology, which has hampered identification of viruses and their linkage with hosts. To analyze each metagenome, we developed a method to classify DNA fragments using genome signature-based phylogenetic classification (GSPC), where metagenomic fragments are compared to a database of oligonucleotide signatures for all previously sequenced Bacteria, Archaea, and viruses.},
  timestamp = {2016-06-16T16:08:16Z},
  journaltitle = {BMC Genomics},
  author = {Pride, David T and Schoenfeld, Thomas},
  date = {2008-01},
  pages = {420},
  keywords = {Archaea,Archaeal Viruses,Archaeal Viruses: classification,Archaeal Viruses: genetics,Archaea: virology,Genomics,Genomics: methods,Hot Springs,Hot Springs: virology,Northwestern United States},
  eprinttype = {pmid},
  eprint = {18798991}
}

@article{PruesseSilva2007,
  title = {{{SILVA}}: A Comprehensive Online Resource for Quality Checked and Aligned Ribosomal {{RNA}} Sequence Data Compatible with {{ARB}}},
  volume = {35},
  issn = {1362-4962},
  doi = {10.1093/nar/gkm864},
  abstract = {Sequencing ribosomal RNA (rRNA) genes is currently the method of choice for phylogenetic reconstruction, nucleic acid based detection and quantification of microbial diversity. The ARB software suite with its corresponding rRNA datasets has been accepted by researchers worldwide as a standard tool for large scale rRNA analysis. However, the rapid increase of publicly available rRNA sequence data has recently hampered the maintenance of comprehensive and curated rRNA knowledge databases. A new system, SILVA (from Latin silva, forest), was implemented to provide a central comprehensive web resource for up to date, quality controlled databases of aligned rRNA sequences from the Bacteria, Archaea and Eukarya domains. All sequences are checked for anomalies, carry a rich set of sequence associated contextual information, have multiple taxonomic classifications, and the latest validly described nomenclature. Furthermore, two precompiled sequence datasets compatible with ARB are offered for download on the SILVA website: (i) the reference (Ref) datasets, comprising only high quality, nearly full length sequences suitable for in-depth phylogenetic analysis and probe design and (ii) the comprehensive Parc datasets with all publicly available rRNA sequences longer than 300 nucleotides suitable for biodiversity analyses. The latest publicly available database release 91 (August 2007) hosts 547 521 sequences split into 461 823 small subunit and 85 689 large subunit rRNAs.},
  timestamp = {2016-06-16T16:08:16Z},
  number = {21},
  journaltitle = {Nucleic Acids Research},
  author = {Pruesse, Elmar and Quast, Christian and Knittel, Katrin and Fuchs, Bernhard M and Ludwig, Wolfgang and Peplies, Jörg and Glöckner, Frank Oliver},
  date = {2007-01},
  pages = {7188--7196},
  keywords = {Base Sequence,Databases,Genes,Internet,Nucleic Acid,Nucleic Acid: standards,Phylogeny,Quality Control,Ribosomal,Ribosomal: genetics,RNA,rRNA,Sequence Alignment,Sequence Analysis,Software},
  eprinttype = {pmid},
  eprint = {17947321}
}

@article{QinHuman2010,
  title = {A Human Gut Microbial Gene Catalogue Established by Metagenomic Sequencing},
  volume = {464},
  issn = {1476-4687},
  doi = {10.1038/nature08821},
  abstract = {To understand the impact of gut microbes on human health and well-being it is crucial to assess their genetic potential. Here we describe the Illumina-based metagenomic sequencing, assembly and characterization of 3.3 million non-redundant microbial genes, derived from 576.7 gigabases of sequence, from faecal samples of 124 European individuals. The gene set, approximately 150 times larger than the human gene complement, contains an overwhelming majority of the prevalent (more frequent) microbial genes of the cohort and probably includes a large proportion of the prevalent human intestinal microbial genes. The genes are largely shared among individuals of the cohort. Over 99\% of the genes are bacterial, indicating that the entire cohort harbours between 1,000 and 1,150 prevalent bacterial species and each individual at least 160 such species, which are also largely shared. We define and describe the minimal gut metagenome and the minimal gut bacterial genome in terms of functions present in all individuals and most bacteria, respectively.},
  timestamp = {2016-06-16T16:08:16Z},
  number = {7285},
  journaltitle = {Nature},
  author = {Qin, Junjie and Li, Ruiqiang and Raes, Jeroen and Arumugam, Manimozhiyan and Burgdorf, Kristoffer Solvsten and Manichanh, Chaysavanh and Nielsen, Trine and Pons, Nicolas and Levenez, Florence and Yamada, Takuji and Mende, Daniel R and Li, Junhua and Xu, Junming and Li, Shaochuan and Li, Dongfang and Cao, Jianjun and Wang, Bo and Liang, Huiqing and Zheng, Huisong and Xie, Yinlong and Tap, Julien and Lepage, Patricia and Bertalan, Marcelo and Batto, Jean-Michel and Hansen, Torben and Le Paslier, Denis and Linneberg, Allan and Nielsen, H Bjørn and Pelletier, Eric and Renault, Pierre and Sicheritz-Ponten, Thomas and Turner, Keith and Zhu, Hongmei and Yu, Chang and Li, Shengting and Jian, Min and Zhou, Yan and Li, Yingrui and Zhang, Xiuqing and Li, Songgang and Qin, Nan and Yang, Huanming and Wang, Jian and Brunak, Søren and Doré, Joel and Guarner, Francisco and Kristiansen, Karsten and Pedersen, Oluf and Parkhill, Julian and Weissenbach, Jean and Bork, Peer and Ehrlich, S Dusko and Wang, Jun},
  date = {2010-03},
  pages = {59--65},
  keywords = {Adult,Bacteria,Bacteria: classification,Bacteria: genetics,Bacteria: isolation & purification,Bacterial,Bacterial: genetics,Bacteria: metabolism,Cohort Studies,Contig Mapping,Denmark,DNA,Essential,Essential: genetics,Feces,Feces: microbiology,Gastrointestinal Tract,Gastrointestinal Tract: microbiology,Genes,Genome,Genomics,Health,Humans,Inflammatory Bowel Diseases,Inflammatory Bowel Diseases: genetics,Metagenome,Metagenome: genetics,Obesity,Obesity: genetics,Open Reading Frames,Open Reading Frames: genetics,Overweight,Overweight: genetics,Sequence Analysis,Spain},
  eprinttype = {pmid},
  eprint = {20203603}
}

@article{QuinceRational2008,
  title = {The Rational Exploration of Microbial Diversity},
  volume = {2},
  issn = {1751-7370},
  doi = {10.1038/ismej.2008.69},
  abstract = {The exploration of the microbial world has been an exciting series of unanticipated discoveries despite being largely uninformed by rational estimates of the magnitude of task confronting us. However, in the long term, more structured surveys can be achieved by estimating the diversity of microbial communities and the effort required to describe them. The rates of recovery of new microbial taxa in very large samples suggest that many more taxa remain to be discovered in soils and the oceans. We apply a robust statistical method to large gene sequence libraries from these environments to estimate both diversity and the sequencing effort required to obtain a given fraction of that diversity. In the upper ocean, we predict some 1400 phylotypes, and a mere fivefold increase in shotgun reads could yield 90\% of the metagenome, that is, all genes from all taxa. However, at deep ocean, hydrothermal vents and diversities in soils can be up to two orders of magnitude larger, and hundreds of times the current number of samples will be required just to obtain 90\% of the taxonomic diversity based on 3\% difference in 16S rDNA. Obtaining 90\% of the metagenome will require tens of thousands of times the current sequencing effort. Although the definitive sequencing of hyperdiverse environments is not yet possible, we can, using taxa-abundance distributions, begin to plan and develop the required methods and strategies. This would initiate a new phase in the exploration of the microbial world.},
  timestamp = {2016-06-16T16:08:17Z},
  number = {10},
  journaltitle = {The ISME Journal},
  author = {Quince, Christopher and Curtis, Thomas P and Sloan, William T},
  date = {2008-10},
  pages = {997--1006},
  keywords = {16S,16S: genetics,Bacteria,Bacteria: genetics,Bacteria: isolation & purification,Bacterial,Bacterial: genetics,Biodiversity,DNA,Genome,Models,Ribosomal,Ribosomal: genetics,RNA,Seawater,Seawater: microbiology,Sequence Analysis,Soil Microbiology,Statistical},
  eprinttype = {pmid},
  eprint = {18650928}
}

@article{QuinceAccurate2009,
  title = {Accurate Determination of Microbial Diversity from 454 Pyrosequencing Data},
  volume = {6},
  issn = {1548-7105},
  doi = {10.1038/nmeth.1361},
  abstract = {We present an algorithm, PyroNoise, that clusters the flowgrams of 454 pyrosequencing reads using a distance measure that models sequencing noise. This infers the true sequences in a collection of amplicons. We pyrosequenced a known mixture of microbial 16S rDNA sequences extracted from a lake and found that without noise reduction the number of operational taxonomic units is overestimated but using PyroNoise it can be accurately calculated.},
  timestamp = {2016-06-16T16:08:17Z},
  number = {9},
  journaltitle = {Nature Methods},
  author = {Quince, Christopher and Lanzén, Anders and Curtis, Thomas P and Davenport, Russell J and Hall, Neil and Head, Ian M and Read, L Fiona and Sloan, William T},
  date = {2009-09},
  pages = {639--641},
  keywords = {16S,16S: genetics,Algorithms,Bacteria,Bacteria: genetics,Bacterial,Bacterial: genetics,DNA,DNA: methods,Genetic,Genetic Variation,Models,Polymerase Chain Reaction,Ribosomal,RNA,Sequence Analysis,Software},
  eprinttype = {pmid},
  eprint = {19668203}
}

@article{RiesenfeldMetagenomics2004,
  title = {Metagenomics: Genomic Analysis of Microbial Communities},
  volume = {38},
  issn = {0066-4197},
  doi = {10.1146/annurev.genet.38.072902.091216},
  abstract = {Uncultured microorganisms comprise the majority of the planet's biological diversity. Microorganisms represent two of the three domains of life and contain vast diversity that is the product of an estimated 3.8 billion years of evolution. In many environments, as many as 99\% of the microorganisms cannot be cultured by standard techniques, and the uncultured fraction includes diverse organisms that are only distantly related to the cultured ones. Therefore, culture-independent methods are essential to understand the genetic diversity, population structure, and ecological roles of the majority of microorganisms. Metagenomics, or the culture-independent genomic analysis of an assemblage of microorganisms, has potential to answer fundamental questions in microbial ecology. This review describes progress toward understanding the biology of uncultured Bacteria, Archaea, and viruses through metagenomic analyses.},
  timestamp = {2016-06-16T16:08:18Z},
  journaltitle = {Annual Review of Genetics},
  author = {Riesenfeld, Christian S and Schloss, Patrick D and Handelsman, Jo},
  date = {2004-01},
  pages = {525--552},
  keywords = {Archaea,Archaea: genetics,Archaeal,Bacteria,Bacteria: genetics,Bacterial,Bacteriophages,Bacteriophages: genetics,Biodiversity,Environmental Microbiology,Gene Library,Genome,Genomic Library,Microscopy,Phylogeny,Rhodopsin,Rhodopsin: genetics,Sequence Analysis},
  eprinttype = {pmid},
  eprint = {15568985}
}

@article{RosenNbc2011,
  title = {{{NBC}}: The {{Naive Bayes Classification}} Tool Webserver for Taxonomic Classification of Metagenomic Reads},
  volume = {27},
  issn = {1367-4811},
  doi = {10.1093/bioinformatics/btq619},
  abstract = {Datasets from high-throughput sequencing technologies have yielded a vast amount of data about organisms in environmental samples. Yet, it is still a challenge to assess the exact organism content in these samples because the task of taxonomic classification is too computationally complex to annotate all reads in a dataset. An easy-to-use webserver is needed to process these reads. While many methods exist, only a few are publicly available on webservers, and out of those, most do not annotate all reads.},
  timestamp = {2016-06-16T16:08:21Z},
  number = {1},
  journaltitle = {Bioinformatics},
  author = {Rosen, Gail L and Reichenberger, Erin R and Rosenfeld, Aaron M},
  date = {2011-01},
  pages = {127--129},
  keywords = {Algorithms,Bayes Theorem,High-Throughput Nucleotide Sequencing,Internet,metagenomics,Metagenomics: methods,Phylogeny,Software},
  eprinttype = {pmid},
  eprint = {21062764}
}

@article{SayersDatabase2009,
  title = {Database Resources of the {{National Center}} for {{Biotechnology Information}}},
  volume = {37},
  issn = {1362-4962},
  doi = {10.1093/nar/gkn741},
  abstract = {In addition to maintaining the GenBank nucleic acid sequence database, the National Center for Biotechnology Information (NCBI) provides analysis and retrieval resources for the data in GenBank and other biological data made available through the NCBI web site. NCBI resources include Entrez, the Entrez Programming Utilities, MyNCBI, PubMed, PubMed Central, Entrez Gene, the NCBI Taxonomy Browser, BLAST, BLAST Link (BLink), Electronic PCR, OrfFinder, Spidey, Splign, RefSeq, UniGene, HomoloGene, ProtEST, dbMHC, dbSNP, Cancer Chromosomes, Entrez Genomes and related tools, the Map Viewer, Model Maker, Evidence Viewer, Clusters of Orthologous Groups (COGs), Retroviral Genotyping Tools, HIV-1/Human Protein Interaction Database, Gene Expression Omnibus (GEO), Entrez Probe, GENSAT, Online Mendelian Inheritance in Man (OMIM), Online Mendelian Inheritance in Animals (OMIA), the Molecular Modeling Database (MMDB), the Conserved Domain Database (CDD), the Conserved Domain Architecture Retrieval Tool (CDART) and the PubChem suite of small molecule databases. Augmenting many of the web applications is custom implementation of the BLAST program optimized to search specialized data sets. All of the resources can be accessed through the NCBI home page at www.ncbi.nlm.nih.gov.},
  timestamp = {2016-06-16T16:08:23Z},
  issue = {Database issue},
  journaltitle = {Nucleic Acids Research},
  author = {Sayers, Eric W and Barrett, Tanya and Benson, Dennis and Bryant, Stephen H and Canese, Kathi and Chetvernin, Vyacheslav and Church, Deanna M and DiCuccio, Michael and Edgar, Ron and Federhen, Scott and Feolo, Michael and Geer, Lewis Y and Helmberg, Wolfgang and Kapustin, Yuri and Landsman, David and Lipman, David J and Madden, Thomas L and Maglott, Donna R and Miller, Vadim and Mizrachi, Ilene and Ostell, James and Pruitt, Kim D and Schuler, Gregory D and Sequeira, Edwin and Sherry, Stephen T and Shumway, Martin and Sirotkin, Karl and Souvorov, Alexandre and Starchenko, Grigory and Tatusova, Tatiana A and Wagner, Lukas and Yaschenko, Eugene and Ye, Jian},
  date = {2009-01},
  pages = {D5--D15},
  keywords = {Databases,Gene Expression,Genes,Genetic,Genomics,Genotype,National Library of Medicine (U.S.),Phenotype,Protein Structure,Proteomics,PubMed,Sequence Homology,Systems Integration,Tertiary,United States},
  eprinttype = {pmid},
  eprint = {18940862}
}

@article{SchadtWindow2010,
  title = {A Window into Third-Generation Sequencing},
  volume = {19},
  issn = {1460-2083},
  doi = {10.1093/hmg/ddq416},
  abstract = {First- and second-generation sequencing technologies have led the way in revolutionizing the field of genomics and beyond, motivating an astonishing number of scientific advances, including enabling a more complete understanding of whole genome sequences and the information encoded therein, a more complete characterization of the methylome and transcriptome and a better understanding of interactions between proteins and DNA. Nevertheless, there are sequencing applications and aspects of genome biology that are presently beyond the reach of current sequencing technologies, leaving fertile ground for additional innovation in this space. In this review, we describe a new generation of single-molecule sequencing technologies (third-generation sequencing) that is emerging to fill this space, with the potential for dramatically longer read lengths, shorter time to result and lower overall cost.},
  timestamp = {2016-06-16T16:08:23Z},
  issue = {R2},
  journaltitle = {Human Molecular Genetics},
  author = {Schadt, Eric E and Turner, Steve and Kasarskis, Andrew},
  date = {2010-10},
  pages = {R227--R240},
  keywords = {Biological,DNA,DNA: methods,High-Throughput Nucleotide Sequencing,High-Throughput Nucleotide Sequencing: methods,Humans,Models,Sequence Analysis},
  eprinttype = {pmid},
  eprint = {20858600}
}

@article{SchloissnigGenomic2013,
  title = {Genomic Variation Landscape of the Human Gut Microbiome},
  volume = {493},
  issn = {1476-4687},
  doi = {10.1038/nature11711},
  abstract = {Whereas large-scale efforts have rapidly advanced the understanding and practical impact of human genomic variation, the practical impact of variation is largely unexplored in the human microbiome. We therefore developed a framework for metagenomic variation analysis and applied it to 252 faecal metagenomes of 207 individuals from Europe and North America. Using 7.4 billion reads aligned to 101 reference species, we detected 10.3 million single nucleotide polymorphisms (SNPs), 107,991 short insertions/deletions, and 1,051 structural variants. The average ratio of non-synonymous to synonymous polymorphism rates of 0.11 was more variable between gut microbial species than across human hosts. Subjects sampled at varying time intervals exhibited individuality and temporal stability of SNP variation patterns, despite considerable composition changes of their gut microbiota. This indicates that individual-specific strains are not easily replaced and that an individual might have a unique metagenomic genotype, which may be exploitable for personalized diet or drug intake.},
  timestamp = {2016-06-16T16:08:24Z},
  number = {7430},
  journaltitle = {Nature},
  author = {Schloissnig, Siegfried and Arumugam, Manimozhiyan and Sunagawa, Shinichi and Mitreva, Makedonka and Tap, Julien and Zhu, Ana and Waller, Alison and Mende, Daniel R and Kultima, Jens Roat and Martin, John and Kota, Karthik and Sunyaev, Shamil R and Weinstock, George M and Bork, Peer},
  date = {2013-01},
  pages = {45--50},
  keywords = {Bacterial,Bacterial: genetics,Europe,Feces,Feces: microbiology,Genetic Variation,Genetic Variation: genetics,Genome,Genotype,Geographic Mapping,Humans,Intestines,Intestines: microbiology,Metagenome,Metagenome: genetics,North America,Polymorphism,Reference Standards,Single Nucleotide,Single Nucleotide: genetics,Time Factors},
  eprinttype = {pmid},
  eprint = {23222524}
}

@article{SchlossCensus2006,
  title = {Toward a Census of Bacteria in Soil},
  volume = {2},
  issn = {1553-7358},
  doi = {10.1371/journal.pcbi.0020092},
  abstract = {For more than a century, microbiologists have sought to determine the species richness of bacteria in soil, but the extreme complexity and unknown structure of soil microbial communities have obscured the answer. We developed a statistical model that makes the problem of estimating richness statistically accessible by evaluating the characteristics of samples drawn from simulated communities with parametric community distributions. We identified simulated communities with rank-abundance distributions that followed a truncated lognormal distribution whose samples resembled the structure of 16S rRNA gene sequence collections made using Alaskan and Minnesotan soils. The simulated communities constructed based on the distribution of 16S rRNA gene sequences sampled from the Alaskan and Minnesotan soils had a richness of 5,000 and 2,000 operational taxonomic units (OTUs), respectively, where an OTU represents a collection of sequences not more than 3\% distant from each other. To sample each of these OTUs in the Alaskan 16S rRNA gene library at least twice, 480,000 sequences would be required; however, to estimate the richness of the simulated communities using nonparametric richness estimators would require only 18,000 sequences. Quantifying the richness of complex environments such as soil is an important step in building an ecological framework. We have shown that generating sufficient sequence data to do so requires less sequencing effort than completely sequencing a bacterial genome.},
  timestamp = {2016-06-16T16:08:26Z},
  number = {7},
  journaltitle = {PLoS Computational Biology},
  author = {Schloss, Patrick D and Handelsman, Jo},
  date = {2006-07},
  pages = {e92},
  keywords = {16S,16S: genetics,Alaska,Bacteria,Bacteria: genetics,Bacteria: isolation & purification,Bacterial,Bacterial: genetics,Colony Count,Gene Library,Microbial,Minnesota,Models,Ribosomal,RNA,Soil Microbiology,Statistical},
  eprinttype = {pmid},
  eprint = {16848637}
}

@article{SegataMetagenomic2012,
  title = {Metagenomic Microbial Community Profiling Using Unique Clade-Specific Marker Genes},
  volume = {9},
  issn = {1548-7105},
  doi = {10.1038/nmeth.2066},
  abstract = {Metagenomic shotgun sequencing data can identify microbes populating a microbial community and their proportions, but existing taxonomic profiling methods are inefficient for increasingly large data sets. We present an approach that uses clade-specific marker genes to unambiguously assign reads to microbial clades more accurately and \textgreater{}50× faster than current approaches. We validated our metagenomic phylogenetic analysis tool, MetaPhlAn, on terabases of short reads and provide the largest metagenomic profiling to date of the human gut. It can be accessed at http://huttenhower.sph.harvard.edu/metaphlan/.},
  timestamp = {2016-06-16T16:08:27Z},
  number = {8},
  journaltitle = {Nature Methods},
  author = {Segata, Nicola and Waldron, Levi and Ballarini, Annalisa and Narasimhan, Vagheesh and Jousson, Olivier and Huttenhower, Curtis},
  date = {2012-06},
  pages = {811--814},
  eprinttype = {pmid},
  eprint = {22688413}
}

@article{SilvaFocus2014,
  title = {{{FOCUS}}: An Alignment-Free Model to Identify Organisms in Metagenomes Using Non-Negative Least Squares},
  volume = {2},
  issn = {2167-8359},
  doi = {10.7717/peerj.425},
  abstract = {One of the major goals in metagenomics is to identify the organisms present in a microbial community from unannotated shotgun sequencing reads. Taxonomic profiling has valuable applications in biological and medical research, including disease diagnostics. Most currently available approaches do not scale well with increasing data volumes, which is important because both the number and lengths of the reads provided by sequencing platforms keep increasing. Here we introduce FOCUS, an agile composition based approach using non-negative least squares (NNLS) to report the organisms present in metagenomic samples and profile their abundances. FOCUS was tested with simulated and real metagenomes, and the results show that our approach accurately predicts the organisms present in microbial communities. FOCUS was implemented in Python. The source code and web-sever are freely available at http://edwards.sdsu.edu/FOCUS.},
  timestamp = {2016-06-16T16:08:30Z},
  journaltitle = {PeerJ},
  author = {Silva, Genivaldo Gueiros Z and Cuevas, Daniel A and Dutilh, Bas E and Edwards, Robert A},
  date = {2014-01},
  pages = {e425},
  eprinttype = {pmid},
  eprint = {24949242}
}

@article{StarkMltreemap2010,
  title = {{{MLTreeMap}} - Accurate Maximum Likelihood Placement of Environmental {{DNA}} Sequences into Taxonomic and Functional Reference Phylogenies},
  volume = {11},
  issn = {1471-2164},
  doi = {10.1186/1471-2164-11-461},
  abstract = {Shotgun sequencing of environmental DNA is an essential technique for characterizing uncultivated microbes in situ. However, the taxonomic and functional assignment of the obtained sequence fragments remains a pressing problem.},
  timestamp = {2016-06-16T16:08:32Z},
  journaltitle = {BMC Genomics},
  author = {Stark, Manuel and Berger, Simon and Stamatakis, Alexandros and von Mering, Christian},
  date = {2010-01},
  pages = {461},
  keywords = {Algorithms,DNA,DNA: analysis,DNA: classification,DNA: genetics,DNA: methods,Internet,Likelihood Functions,phylogenetic placement algorithm,Phylogeny,Sequence Analysis,Software Design,taxonomic binning},
  options = {useprefix=true},
  eprinttype = {pmid},
  eprint = {20687950}
}

@article{SuMetaabc2011,
  title = {{{MetaABC}}–an Integrated Metagenomics Platform for Data Adjustment, Binning and Clustering},
  volume = {27},
  issn = {1367-4811},
  doi = {10.1093/bioinformatics/btr376},
  abstract = {MetaABC is a metagenomic platform that integrates several binning tools coupled with methods for removing artifacts, analyzing unassigned reads and controlling sampling biases. It allows users to arrive at a better interpretation via series of distinct combinations of analysis tools. After execution, MetaABC provides outputs in various visual formats such as tables, pie and bar charts as well as clustering result diagrams.},
  timestamp = {2016-06-16T16:08:34Z},
  number = {16},
  journaltitle = {Bioinformatics},
  author = {Su, Chien-Hao and Hsu, Ming-Tsung and Wang, Tse-Yi and Chiang, Sufeng and Cheng, Jen-Hao and Weng, Francis C and Kao, Cheng-Yan and Wang, Daryi and Tsai, Huai-Kuang},
  date = {2011-08},
  pages = {2298--2299},
  keywords = {Cluster Analysis,metagenomics,Metagenomics: methods,Software,Systems Integration},
  eprinttype = {pmid},
  eprint = {21697124}
}

@article{SuenInsect2010,
  title = {An Insect Herbivore Microbiome with High Plant Biomass-Degrading Capacity},
  volume = {6},
  issn = {1553-7404},
  doi = {10.1371/journal.pgen.1001129},
  abstract = {Herbivores can gain indirect access to recalcitrant carbon present in plant cell walls through symbiotic associations with lignocellulolytic microbes. A paradigmatic example is the leaf-cutter ant (Tribe: Attini), which uses fresh leaves to cultivate a fungus for food in specialized gardens. Using a combination of sugar composition analyses, metagenomics, and whole-genome sequencing, we reveal that the fungus garden microbiome of leaf-cutter ants is composed of a diverse community of bacteria with high plant biomass-degrading capacity. Comparison of this microbiome's predicted carbohydrate-degrading enzyme profile with other metagenomes shows closest similarity to the bovine rumen, indicating evolutionary convergence of plant biomass degrading potential between two important herbivorous animals. Genomic and physiological characterization of two dominant bacteria in the fungus garden microbiome provides evidence of their capacity to degrade cellulose. Given the recent interest in cellulosic biofuels, understanding how large-scale and rapid plant biomass degradation occurs in a highly evolved insect herbivore is of particular relevance for bioenergy.},
  timestamp = {2016-06-16T16:08:34Z},
  number = {9},
  journaltitle = {PLoS genetics},
  author = {Suen, Garret and Scott, Jarrod J and Aylward, Frank O and Adams, Sandra M and Tringe, Susannah G and Pinto-Tomás, Adrián A and Foster, Clifton E and Pauly, Markus and Weimer, Paul J and Barry, Kerrie W and Goodwin, Lynne A and Bouffard, Pascal and Li, Lewyn and Osterberger, Jolene and Harkins, Timothy T and Slater, Steven C and Donohue, Timothy J and Currie, Cameron R},
  date = {2010-09},
  pages = {e1001129},
  keywords = {Animals,Ants,Ants: microbiology,Biomass,Biopolymers,Biopolymers: metabolism,Carbohydrate Metabolism,Carbohydrate Metabolism: genetics,Cattle,Cluster Analysis,Feeding Behavior,Feeding Behavior: physiology,Fungi,Fungi: genetics,Metagenome,Metagenome: genetics,Molecular Sequence Data,Phylogeny,Plant Leaves,Plant Leaves: metabolism},
  eprinttype = {pmid},
  eprint = {20885794}
}

@article{SunagawaMetagenomic2013,
  title = {Metagenomic Species Profiling Using Universal Phylogenetic Marker Genes},
  volume = {10},
  issn = {1548-7105},
  doi = {10.1038/nmeth.2693},
  abstract = {To quantify known and unknown microorganisms at species-level resolution using shotgun sequencing data, we developed a method that establishes metagenomic operational taxonomic units (mOTUs) based on single-copy phylogenetic marker genes. Applied to 252 human fecal samples, the method revealed that on average 43\% of the species abundance and 58\% of the richness cannot be captured by current reference genome-based methods. An implementation of the method is available at http://www.bork.embl.de/software/mOTU/.},
  timestamp = {2016-06-16T16:08:35Z},
  number = {12},
  journaltitle = {Nature methods},
  author = {Sunagawa, Shinichi and Mende, Daniel R and Zeller, Georg and Izquierdo-Carrasco, Fernando and Berger, Simon A and Kultima, Jens Roat and Coelho, Luis Pedro and Arumugam, Manimozhiyan and Tap, Julien and Nielsen, Henrik Bjørn and Rasmussen, Simon and Brunak, Søren and Pedersen, Oluf and Guarner, Francisco and de Vos, Willem M and Wang, Jun and Li, Junhua and Doré, Joël and Ehrlich, S Dusko and Stamatakis, Alexandros and Bork, Peer},
  date = {2013-12},
  pages = {1196--1199},
  options = {useprefix=true},
  eprinttype = {pmid},
  eprint = {24141494}
}

@article{TeelingTetra2004,
  title = {{{TETRA}}: A Web-Service and a Stand-Alone Program for the Analysis and Comparison of Tetranucleotide Usage Patterns in {{DNA}} Sequences},
  volume = {5},
  issn = {1471-2105},
  doi = {10.1186/1471-2105-5-163},
  abstract = {BACKGROUND: In the emerging field of environmental genomics, direct cloning and sequencing of genomic fragments from complex microbial communities has proven to be a valuable source of new enzymes, expanding the knowledge of basic biological processes. The central problem of this so called metagenome-approach is that the cloned fragments often lack suitable phylogenetic marker genes, rendering the identification of clones that are likely to originate from the same genome difficult or impossible. In such cases, the analysis of intrinsic DNA-signatures like tetranucleotide frequencies can provide valuable hints on fragment affiliation. With this application in mind, the TETRA web-service and the TETRA stand-alone program have been developed, both of which automate the task of comparative tetranucleotide frequency analysis. Availability: http://www.megx.net/tetra. RESULTS: TETRA provides a statistical analysis of tetranucleotide usage patterns in genomic fragments, either via a web-service or a stand-alone program. With respect to discriminatory power, such an analysis outperforms the assignment of genomic fragments based on the (G+C)-content, which is a widely-used sequence-based measure for assessing fragment relatedness. While the web-service is restricted to the calculation of correlation coefficients between tetranucleotide usage patterns of submitted DNA sequences, the stand-alone program generates a much more detailed output, comprising all raw data and graphical plots. The stand-alone program is controlled via a graphical user interface and can batch-process a multitude of sequences. Furthermore, it comes with pre-computed tetranucleotide usage patterns for 166 prokaryote chromosomes, providing a useful reference dataset and source for data-mining. CONCLUSIONS: Up to now, the analysis of skewed oligonucleotide distributions within DNA sequences is not a commonly used tool within metagenomics. With the TETRA web-service and stand-alone program, the method is now accessible in an easy to use manner for a broad audience. This will hopefully facilitate the interrelation of genomic fragments from metagenome libraries, ultimately leading to new insights into the genetic potentials of yet uncultured microorganisms.},
  timestamp = {2016-06-16T16:08:35Z},
  journaltitle = {BMC bioinformatics},
  author = {Teeling, Hanno and Waldmann, Jost and Lombardot, Thierry and Bauer, Margarete and Glöckner, Frank Oliver},
  date = {2004-10},
  pages = {163},
  keywords = {Bacterial,Bacterial: genetics,Base Composition,Base Composition: genetics,Bradyrhizobium,Bradyrhizobium: genetics,Chromosomes,DNA,DNA: methods,Escherichia,Escherichia: genetics,Genome,Internet,Microsatellite Repeats,Microsatellite Repeats: genetics,Prochlorococcus,Prochlorococcus: genetics,Sequence Analysis,Shigella,Shigella: genetics,Sinorhizobium,Sinorhizobium: genetics,Software,taxonomic binning,unsupervised taxonomic binning,Yersinia,Yersinia: genetics},
  eprinttype = {pmid},
  eprint = {15507136}
}

@article{ThompsonProperties2011,
  title = {The Properties and Applications of Single-Molecule {{DNA}} Sequencing},
  volume = {12},
  issn = {1465-6914},
  doi = {10.1186/gb-2011-12-2-217},
  abstract = {Single-molecule sequencing enables DNA or RNA to be sequenced directly from biological samples, making it well-suited for diagnostic and clinical applications. Here we review the properties and applications of this rapidly evolving and promising technology.},
  timestamp = {2016-06-16T16:08:35Z},
  number = {2},
  journaltitle = {Genome biology},
  author = {Thompson, John F and Milos, Patrice M},
  date = {2011-01},
  pages = {217},
  keywords = {Animals,DNA,DNA: analysis,DNA-Directed DNA Polymerase,DNA-Directed DNA Polymerase: genetics,DNA-Directed DNA Polymerase: metabolism,DNA: genetics,DNA: instrumentation,DNA: methods,DNA Primers,DNA Primers: chemistry,DNA Primers: genetics,Fluorescence,Fluorescence Resonance Energy Transfer,Genomics,Genomics: instrumentation,Genomics: methods,High-Throughput Nucleotide Sequencing,High-Throughput Nucleotide Sequencing: instrumentation,High-Throughput Nucleotide Sequencing: methods,Humans,Molecular Probes,Molecular Probes: chemistry,Molecular Probes: metabolism,Nanotechnology,Nanotechnology: instrumentation,Nanotechnology: methods,Quantum Dots,RNA,RNA: analysis,RNA: genetics,Sequence Analysis},
  eprinttype = {pmid},
  eprint = {21349208}
}

@article{TringeComparative2005,
  title = {Comparative Metagenomics of Microbial Communities},
  volume = {308},
  issn = {1095-9203},
  doi = {10.1126/science.1107851},
  abstract = {The species complexity of microbial communities and challenges in culturing representative isolates make it difficult to obtain assembled genomes. Here we characterize and compare the metabolic capabilities of terrestrial and marine microbial communities using largely unassembled sequence data obtained by shotgun sequencing DNA isolated from the various environments. Quantitative gene content analysis reveals habitat-specific fingerprints that reflect known characteristics of the sampled environments. The identification of environment-specific genes through a gene-centric comparative analysis presents new opportunities for interpreting and diagnosing environments.},
  timestamp = {2016-06-16T16:08:39Z},
  number = {5721},
  journaltitle = {Science (New York, N.Y.)},
  author = {Tringe, Susannah Green and von Mering, Christian and Kobayashi, Arthur and Salamov, Asaf A and Chen, Kevin and Chang, Hwai W and Podar, Mircea and Short, Jay M and Mathur, Eric J and Detter, John C and Bork, Peer and Hugenholtz, Philip and Rubin, Edward M},
  date = {2005-04},
  pages = {554--557},
  keywords = {Animals,Archaea,Archaea: classification,Archaea: genetics,Archaea: metabolism,Bacteria,Bacteria: classification,Bacteria: genetics,Bacterial,Bacterial Proteins,Bacterial Proteins: genetics,Bacterial Proteins: metabolism,Bacteria: metabolism,Biodiversity,Biofilms,Bone and Bones,Bone and Bones: microbiology,Computational Biology,DNA,Ecosystem,Energy Metabolism,Eukaryotic Cells,Eukaryotic Cells: metabolism,Gene Library,Genes,Genome,Genomics,Molecular Sequence Data,Operon,Phylogeny,Polymerase Chain Reaction,Proteins,Proteins: genetics,Proteins: metabolism,Proteome,Seawater,Seawater: microbiology,Sequence Analysis,Soil Microbiology,Whales,Whales: microbiology},
  options = {useprefix=true},
  eprinttype = {pmid},
  eprint = {15845853}
}

@article{TurnbaughOrganismal2010,
  title = {Organismal, Genetic, and Transcriptional Variation in the Deeply Sequenced Gut Microbiomes of Identical Twins},
  volume = {107},
  issn = {1091-6490},
  doi = {10.1073/pnas.1002355107},
  abstract = {We deeply sampled the organismal, genetic, and transcriptional diversity in fecal samples collected from a monozygotic (MZ) twin pair and compared the results to 1,095 communities from the gut and other body habitats of related and unrelated individuals. Using a new scheme for noise reduction in pyrosequencing data, we estimated the total diversity of species-level bacterial phylotypes in the 1.2-1.5 million bacterial 16S rRNA reads obtained from each deeply sampled cotwin to be approximately 800 (35.9\%, 49.1\% detected in both). A combined 1.1 million read 16S rRNA dataset representing 281 shallowly sequenced fecal samples from 54 twin pairs and their mothers contained an estimated 4,018 species-level phylotypes, with each sample having a unique species assemblage (53.4 +/- 0.6\% and 50.3 +/- 0.5\% overlap with the deeply sampled cotwins). Of the 134 phylotypes with a relative abundance of $>$0.1\% in the combined dataset, only 37 appeared in $>$50\% of the samples, with one phylotype in the Lachnospiraceae family present in 99\%. Nongut communities had significantly reduced overlap with the deeply sequenced twins' fecal microbiota (18.3 +/- 0.3\%, 15.3 +/- 0.3\%). The MZ cotwins' fecal DNA was deeply sequenced (3.8-6.3 Gbp/sample) and assembled reads were assigned to 25 genus-level phylogenetic bins. Only 17\% of the genes in these bins were shared between the cotwins. Bins exhibited differences in their degree of sequence variation, gene content including the repertoire of carbohydrate active enzymes present within and between twins (e.g., predicted cellulases, dockerins), and transcriptional activities. These results provide an expanded perspective about features that make each of us unique life forms and directions for future characterization of our gut ecosystems.},
  timestamp = {2016-06-16T16:08:41Z},
  number = {16},
  journaltitle = {Proceedings of the National Academy of Sciences of the United States of America},
  author = {Turnbaugh, Peter J and Quince, Christopher and Faith, Jeremiah J and McHardy, Alice C and Yatsunenko, Tanya and Niazi, Faheem and Affourtit, Jason and Egholm, Michael and Henrissat, Bernard and Knight, Rob and Gordon, Jeffrey I},
  date = {2010-04},
  pages = {7503--7508},
  keywords = {16S,16S: metabolism,Adult,Algorithms,Bacteria,Bacteria: classification,Bacteria: genetics,Carbohydrates,Carbohydrates: chemistry,Feces,Female,Gastrointestinal Tract,Gastrointestinal Tract: microbiology,Genetic,Genetic Variation,Humans,Male,Models,Monozygotic,Obesity,Obesity: complications,Phylogeny,Ribosomal,RNA,Transcription,Twins},
  eprinttype = {pmid},
  eprint = {20363958}
}

@article{TysonCommunity2004,
  title = {Community Structure and Metabolism through Reconstruction of Microbial Genomes from the Environment},
  volume = {428},
  issn = {1476-4687},
  doi = {10.1038/nature02340},
  abstract = {Microbial communities are vital in the functioning of all ecosystems; however, most microorganisms are uncultivated, and their roles in natural systems are unclear. Here, using random shotgun sequencing of DNA from a natural acidophilic biofilm, we report reconstruction of near-complete genomes of Leptospirillum group II and Ferroplasma type II, and partial recovery of three other genomes. This was possible because the biofilm was dominated by a small number of species populations and the frequency of genomic rearrangements and gene insertions or deletions was relatively low. Because each sequence read came from a different individual, we could determine that single-nucleotide polymorphisms are the predominant form of heterogeneity at the strain level. The Leptospirillum group II genome had remarkably few nucleotide polymorphisms, despite the existence of low-abundance variants. The Ferroplasma type II genome seems to be a composite from three ancestral strains that have undergone homologous recombination to form a large population of mosaic genomes. Analysis of the gene complement for each organism revealed the pathways for carbon and nitrogen fixation and energy generation, and provided insights into survival strategies in an extreme environment.},
  timestamp = {2016-06-16T16:08:42Z},
  number = {6978},
  journaltitle = {Nature},
  author = {Tyson, Gene W and Chapman, Jarrod and Hugenholtz, Philip and Allen, Eric E and Ram, Rachna J and Richardson, Paul M and Solovyev, Victor V and Rubin, Edward M and Rokhsar, Daniel S and Banfield, Jillian F},
  date = {2004-03},
  pages = {37--43},
  keywords = {16S,16S: genetics,Archaea,Archaea: classification,Archaea: genetics,Archaeal,Archaeal: genetics,Archaea: metabolism,Bacteria,Bacteria: classification,Bacteria: genetics,Bacterial,Bacterial: genetics,Bacteria: metabolism,Base Composition,Base Sequence,Biofilms,Biofilms: growth & development,Carbon,Carbon: metabolism,DNA,Ecosystem,Environmental Microbiology,Genes,Genetic,Genetic Complementation Test,Genetic: genetics,Genome,Genomics,Molecular Sequence Data,Nitrogen Fixation,Open Reading Frames,Open Reading Frames: genetics,Phylogeny,Polymorphism,Recombination,Ribosomal,RNA,Sequence Analysis,Single Nucleotide,Single Nucleotide: genetics,Species Specificity},
  eprinttype = {pmid},
  eprint = {14961025}
}

@article{VenterEnvironmental2004,
  title = {Environmental Genome Shotgun Sequencing of the {{Sargasso Sea}}},
  volume = {304},
  issn = {1095-9203},
  doi = {10.1126/science.1093857},
  abstract = {We have applied "whole-genome shotgun sequencing" to microbial populations collected en masse on tangential flow and impact filters from seawater samples collected from the Sargasso Sea near Bermuda. A total of 1.045 billion base pairs of nonredundant sequence was generated, annotated, and analyzed to elucidate the gene content, diversity, and relative abundance of the organisms within these environmental samples. These data are estimated to derive from at least 1800 genomic species based on sequence relatedness, including 148 previously unknown bacterial phylotypes. We have identified over 1.2 million previously unknown genes represented in these samples, including more than 782 new rhodopsin-like photoreceptors. Variation in species present and stoichiometry suggests substantial oceanic microbial diversity.},
  timestamp = {2016-06-16T16:08:44Z},
  number = {5667},
  journaltitle = {Science (New York, N.Y.)},
  author = {Venter, J Craig and Remington, Karin and Heidelberg, John F and Halpern, Aaron L and Rusch, Doug and Eisen, Jonathan A and Wu, Dongying and Paulsen, Ian and Nelson, Karen E and Nelson, William and Fouts, Derrick E and Levy, Samuel and Knap, Anthony H and Lomas, Michael W and Nealson, Ken and White, Owen and Peterson, Jeremy and Hoffman, Jeff and Parsons, Rachel and Baden-Tillson, Holly and Pfannkoch, Cynthia and Rogers, Yu-Hui and Smith, Hamilton O},
  date = {2004-04},
  pages = {66--74},
  keywords = {Archaea,Archaea: genetics,Archaeal,Atlantic Ocean,Bacteria,Bacteria: genetics,Bacterial,Bacteriophages,Bacteriophages: genetics,Biodiversity,Computational Biology,Cyanobacteria,Cyanobacteria: genetics,Cyanobacteria: growth & development,Cyanobacteria: metabolism,DNA,Ecosystem,Eukaryotic Cells,Genes,Genome,Genomics,Molecular Sequence Data,Photosynthesis,Phylogeny,Plasmids,Rhodopsin,Rhodopsin: genetics,rRNA,Seawater,Seawater: microbiology,Sequence Analysis,Water Microbiology},
  eprinttype = {pmid},
  eprint = {15001713}
}

@article{WangNaive2007,
  title = {Naive {{Bayesian}} Classifier for Rapid Assignment of {{rRNA}} Sequences into the New Bacterial Taxonomy},
  volume = {73},
  issn = {0099-2240},
  doi = {10.1128/AEM.00062-07},
  abstract = {The Ribosomal Database Project (RDP) Classifier, a naïve Bayesian classifier, can rapidly and accurately classify bacterial 16S rRNA sequences into the new higher-order taxonomy proposed in Bergey's Taxonomic Outline of the Prokaryotes (2nd ed., release 5.0, Springer-Verlag, New York, NY, 2004). It provides taxonomic assignments from domain to genus, with confidence estimates for each assignment. The majority of classifications (98\%) were of high estimated confidence ($>$ or = 95\%) and high accuracy (98\%). In addition to being tested with the corpus of 5,014 type strain sequences from Bergey's outline, the RDP Classifier was tested with a corpus of 23,095 rRNA sequences as assigned by the NCBI into their alternative higher-order taxonomy. The results from leave-one-out testing on both corpora show that the overall accuracies at all levels of confidence for near-full-length and 400-base segments were 89\% or above down to the genus level, and the majority of the classification errors appear to be due to anomalies in the current taxonomies. For shorter rRNA segments, such as those that might be generated by pyrosequencing, the error rate varied greatly over the length of the 16S rRNA gene, with segments around the V2 and V4 variable regions giving the lowest error rates. The RDP Classifier is suitable both for the analysis of single rRNA sequences and for the analysis of libraries of thousands of sequences. Another related tool, RDP Library Compare, was developed to facilitate microbial-community comparison based on 16S rRNA gene sequence libraries. It combines the RDP Classifier with a statistical test to flag taxa differentially represented between samples. The RDP Classifier and RDP Library Compare are available online at http://rdp.cme.msu.edu/.},
  timestamp = {2016-06-16T16:08:46Z},
  number = {16},
  journaltitle = {Applied and environmental microbiology},
  author = {Wang, Qiong and Garrity, George M and Tiedje, James M and Cole, James R},
  date = {2007-08},
  pages = {5261--5267},
  keywords = {16S,16S: genetics,Abbildung 16S,Algorithms,Bacteria,Bacteria: classification,Bacteria: genetics,Bayes Theorem,Classification,Classification: methods,Databases,Nucleic Acid,Phylogeny,Ribosomal,Ribosomal: genetics,RNA},
  eprinttype = {pmid},
  eprint = {17586664}
}

@article{WarneckeMetagenomic2007,
  title = {Metagenomic and Functional Analysis of Hindgut Microbiota of a Wood-Feeding Higher Termite},
  volume = {450},
  issn = {1476-4687},
  doi = {10.1038/nature06269},
  abstract = {From the standpoints of both basic research and biotechnology, there is considerable interest in reaching a clearer understanding of the diversity of biological mechanisms employed during lignocellulose degradation. Globally, termites are an extremely successful group of wood-degrading organisms and are therefore important both for their roles in carbon turnover in the environment and as potential sources of biochemical catalysts for efforts aimed at converting wood into biofuels. Only recently have data supported any direct role for the symbiotic bacteria in the gut of the termite in cellulose and xylan hydrolysis. Here we use a metagenomic analysis of the bacterial community resident in the hindgut paunch of a wood-feeding 'higher' Nasutitermes species (which do not contain cellulose-fermenting protozoa) to show the presence of a large, diverse set of bacterial genes for cellulose and xylan hydrolysis. Many of these genes were expressed in vivo or had cellulase activity in vitro, and further analyses implicate spirochete and fibrobacter species in gut lignocellulose degradation. New insights into other important symbiotic functions including H2 metabolism, CO2-reductive acetogenesis and N2 fixation are also provided by this first system-wide gene analysis of a microbial community specialized towards plant lignocellulose degradation. Our results underscore how complex even a 1-microl environment can be.},
  timestamp = {2016-06-16T16:08:46Z},
  number = {7169},
  journaltitle = {Nature},
  author = {Warnecke, Falk and Luginbühl, Peter and Ivanova, Natalia and Ghassemian, Majid and Richardson, Toby H and Stege, Justin T and Cayouette, Michelle and McHardy, Alice C and Djordjevic, Gordana and Aboushadi, Nahla and Sorek, Rotem and Tringe, Susannah G and Podar, Mircea and Martin, Hector Garcia and Kunin, Victor and Dalevi, Daniel and Madejska, Julita and Kirton, Edward and Platt, Darren and Szeto, Ernest and Salamov, Asaf and Barry, Kerrie and Mikhailova, Natalia and Kyrpides, Nikos C and Matson, Eric G and Ottesen, Elizabeth A and Zhang, Xinning and Hernández, Myriam and Murillo, Catalina and Acosta, Luis G and Rigoutsos, Isidore and Tamayo, Giselle and Green, Brian D and Chang, Cathy and Rubin, Edward M and Mathur, Eric J and Robertson, Dan E and Hugenholtz, Philip and Leadbetter, Jared R},
  date = {2007-11},
  pages = {560--565},
  keywords = {Animals,Bacteria,Bacteria: enzymology,Bacteria: genetics,Bacteria: isolation & purification,Bacterial,Bacterial: genetics,Bacteria: metabolism,Bioelectric Energy Sources,Biological,Carbon,Carbon: metabolism,Catalytic Domain,Cellulose,Cellulose: metabolism,Costa Rica,Genes,Genome,Genomics,Glycoside Hydrolases,Glycoside Hydrolases: chemistry,Glycoside Hydrolases: genetics,Glycoside Hydrolases: metabolism,Hydrolysis,Intestines,Intestines: microbiology,Isoptera,Isoptera: metabolism,Isoptera: microbiology,Lignin,Lignin: metabolism,Models,Molecular Sequence Data,Polymerase Chain Reaction,symbiosis,Wood,Wood: chemistry,Wood: metabolism,Xylans,Xylans: metabolism},
  eprinttype = {pmid},
  eprint = {18033299}
}

@article{WeberPractical2011,
  title = {Practical Application of Self-Organizing Maps to Interrelate Biodiversity and Functional Data in {{NGS}}-Based Metagenomics},
  volume = {5},
  issn = {1751-7370},
  doi = {10.1038/ismej.2010.180},
  abstract = {Next-generation sequencing (NGS) technologies have enabled the application of broad-scale sequencing in microbial biodiversity and metagenome studies. Biodiversity is usually targeted by classifying 16S ribosomal RNA genes, while metagenomic approaches target metabolic genes. However, both approaches remain isolated, as long as the taxonomic and functional information cannot be interrelated. Techniques like self-organizing maps (SOMs) have been applied to cluster metagenomes into taxon-specific bins in order to link biodiversity with functions, but have not been applied to broad-scale NGS-based metagenomics yet. Here, we provide a novel implementation, demonstrate its potential and practicability, and provide a web-based service for public usage. Evaluation with published data sets mimicking varyingly complex habitats resulted into classification specificities and sensitivities of close to 100\% to above 90\% from phylum to genus level for assemblies exceeding 8 kb for low and medium complexity data. When applied to five real-world metagenomes of medium complexity from direct pyrosequencing of marine subsurface waters, classifications of assemblies above 2.5 kb were in good agreement with fluorescence in situ hybridizations, indicating that biodiversity was mostly retained within the metagenomes, and confirming high classification specificities. This was validated by two protein-based classifications (PBCs) methods. SOMs were able to retrieve the relevant taxa down to the genus level, while surpassing PBCs in resolution. In order to make the approach accessible to a broad audience, we implemented a feature-rich web-based SOM application named TaxSOM, which is freely available at http://www.megx.net/toolbox/taxsom. TaxSOM can classify reads or assemblies exceeding 2.5 kb with high accuracy and thus assists in linking biodiversity and functions in metagenome studies, which is a precondition to study microbial ecology in a holistic fashion.},
  timestamp = {2016-06-16T16:08:47Z},
  number = {5},
  journaltitle = {The ISME journal},
  author = {Weber, Marc and Teeling, Hanno and Huang, Sixing and Waldmann, Jost and Kassabgy, Mariette and Fuchs, Bernhard M and Klindworth, Anna and Klockow, Christine and Wichels, Antje and Gerdts, Gunnar and Amann, Rudolf and Glöckner, Frank Oliver},
  date = {2011-05},
  pages = {918--928},
  keywords = {binning,Classification,metagenomics,molecular ecology,self-organizing map,som,taxonomic,taxsom},
  eprinttype = {pmid},
  eprint = {21160538}
}

@inproceedings{WilkeningUsing2009,
  author = {Wilkening, Jared and Wilke, Andreas and Desai, Narayan and Meyer, Folker},
  title = {Using Clouds for Metagenomics: {{A}} Case Study},
  booktitle = {2009 {{IEEE International Conference}} on {{Cluster Computing}} and {{Workshops}}},
  publisher = {{IEEE}},
  date = {2009},
  pages = {1--6},
  isbn = {978-1-4244-5011-4},
  doi = {10.1109/CLUSTR.2009.5289187},
  timestamp = {2016-06-16T16:08:50Z}
}

@article{WoodKraken2014,
  title = {Kraken: Ultrafast Metagenomic Sequence Classification Using Exact Alignments},
  volume = {15},
  issn = {1465-6914},
  doi = {10.1186/gb-2014-15-3-r46},
  abstract = {Kraken is an ultrafast and highly accurate program for assigning taxonomic labels to metagenomic DNA sequences. Previous programs designed for this task have been relatively slow and computationally expensive, forcing researchers to use faster abundance estimation programs, which only classify small subsets of metagenomic data. Using exact alignment of k-mers, Kraken achieves classification accuracy comparable to the fastest BLAST program. In its fastest mode, Kraken classifies 100 base pair reads at a rate of over 4.1 million reads per minute, 909 times faster than Megablast and 11 times faster than the abundance estimation program MetaPhlAn. Kraken is available at http://ccb.jhu.edu/software/kraken/.},
  timestamp = {2016-06-16T16:08:51Z},
  number = {3},
  journaltitle = {Genome biology},
  author = {Wood, Derrick E and Salzberg, Steven L},
  date = {2014-03},
  pages = {R46},
  keywords = {metagenomics,microbiome,next-generation sequencing,Sequence Alignment,sequence classification},
  eprinttype = {pmid},
  eprint = {24580807}
}

@article{WoykeSymbiosis2006,
  title = {Symbiosis Insights through Metagenomic Analysis of a Microbial Consortium},
  volume = {443},
  issn = {1476-4687},
  doi = {10.1038/nature05192},
  abstract = {Symbioses between bacteria and eukaryotes are ubiquitous, yet our understanding of the interactions driving these associations is hampered by our inability to cultivate most host-associated microbes. Here we use a metagenomic approach to describe four co-occurring symbionts from the marine oligochaete Olavius algarvensis, a worm lacking a mouth, gut and nephridia. Shotgun sequencing and metabolic pathway reconstruction revealed that the symbionts are sulphur-oxidizing and sulphate-reducing bacteria, all of which are capable of carbon fixation, thus providing the host with multiple sources of nutrition. Molecular evidence for the uptake and recycling of worm waste products by the symbionts suggests how the worm could eliminate its excretory system, an adaptation unique among annelid worms. We propose a model that describes how the versatile metabolism within this symbiotic consortium provides the host with an optimal energy supply as it shuttles between the upper oxic and lower anoxic coastal sediments that it inhabits.},
  timestamp = {2016-06-16T16:08:52Z},
  number = {7114},
  journaltitle = {Nature},
  author = {Woyke, Tanja and Teeling, Hanno and Ivanova, Natalia N and Huntemann, Marcel and Richter, Michael and Gloeckner, Frank Oliver and Boffelli, Dario and Anderson, Iain J and Barry, Kerrie W and Shapiro, Harris J and Szeto, Ernest and Kyrpides, Nikos C and Mussmann, Marc and Amann, Rudolf and Bergin, Claudia and Ruehland, Caroline and Rubin, Edward M and Dubilier, Nicole},
  date = {2006-10},
  pages = {950--955},
  keywords = {Animals,Biological,Carbon,Carbon: metabolism,Digestion,Digestion: physiology,Energy Metabolism,Environment,Genomics,metagenomics,Microbiology,Models,Oligochaeta,Oligochaeta: microbiology,Oligochaeta: physiology,Proteobacteria,Proteobacteria: genetics,Proteobacteria: metabolism,symbiosis,Symbiosis: genetics,Symbiosis: physiology,taxonomic binning,unsupervised taxonomic binning},
  eprinttype = {pmid},
  eprint = {16980956}
}

% Woyke et al., PLoS ONE 5(4), e10314: finished genome from a single uncultured cell.
% "Moran, Nancy A": the middle initial must follow the given name; as a leading
% lowercase "a" it is parsed as a name prefix (von part).
@article{WoykeOne2010,
  title = {One Bacterial Cell, One Complete Genome},
  volume = {5},
  issn = {1932-6203},
  doi = {10.1371/journal.pone.0010314},
  abstract = {While the bulk of the finished microbial genomes sequenced to date are derived from cultured bacterial and archaeal representatives, the vast majority of microorganisms elude current culturing attempts, severely limiting the ability to recover complete or even partial genomes from these environmental species. Single cell genomics is a novel culture-independent approach, which enables access to the genetic material of an individual cell. No single cell genome has to our knowledge been closed and finished to date. Here we report the completed genome from an uncultured single cell of Candidatus Sulcia muelleri DMIN. Digital PCR on single symbiont cells isolated from the bacteriome of the green sharpshooter Draeculacephala minerva bacteriome allowed us to assess that this bacteria is polyploid with genome copies ranging from approximately 200-900 per cell, making it a most suitable target for single cell finishing efforts. For single cell shotgun sequencing, an individual Sulcia cell was isolated and whole genome amplified by multiple displacement amplification (MDA). Sanger-based finishing methods allowed us to close the genome. To verify the correctness of our single cell genome and exclude MDA-derived artifacts, we independently shotgun sequenced and assembled the Sulcia genome from pooled bacteriomes using a metagenomic approach, yielding a nearly identical genome. Four variations we detected appear to be genuine biological differences between the two samples. Comparison of the single cell genome with bacteriome metagenomic sequence data detected two single nucleotide polymorphisms (SNPs), indicating extremely low genetic diversity within a Sulcia population. This study demonstrates the power of single cell genomics to generate a complete, high quality, non-composite reference genome within an environmental sample, which can be used for population genetic analyzes.},
  timestamp = {2016-06-16T16:08:53Z},
  number = {4},
  journaltitle = {PLoS ONE},
  author = {Woyke, Tanja and Tighe, Damon and Mavromatis, Konstantinos and Clum, Alicia and Copeland, Alex and Schackwitz, Wendy and Lapidus, Alla and Wu, Dongying and McCutcheon, John P and McDonald, Bradon R and Moran, Nancy A and Bristow, James and Cheng, Jan-Fang},
  date = {2010-04},
  pages = {e10314},
  keywords = {Bacteria,Bacteria: cytology,Bacteria: genetics,Bacterial,Bacterial: genetics,DNA,Genes,Genetic Variation,Genome,Genomics,Genomics: methods,metagenomics,Nucleic Acid Amplification Techniques,Nucleic Acid Amplification Techniques: methods,Polymorphism,Sequence Analysis,Single Nucleotide},
  eprinttype = {pmid},
  eprint = {20428247}
}

% Woyke et al., PLoS ONE 4(4), e5299: single-cell genomes of two marine flavobacteria.
% "Eisen, Jonathan A": the middle initial must follow the given name; as a leading
% lowercase "a" it is parsed as a name prefix (von part).
@article{WoykeAssembling2009,
  title = {Assembling the Marine Metagenome, One Cell at a Time},
  volume = {4},
  issn = {1932-6203},
  doi = {10.1371/journal.pone.0005299},
  abstract = {The difficulty associated with the cultivation of most microorganisms and the complexity of natural microbial assemblages, such as marine plankton or human microbiome, hinder genome reconstruction of representative taxa using cultivation or metagenomic approaches. Here we used an alternative, single cell sequencing approach to obtain high-quality genome assemblies of two uncultured, numerically significant marine microorganisms. We employed fluorescence-activated cell sorting and multiple displacement amplification to obtain hundreds of micrograms of genomic DNA from individual, uncultured cells of two marine flavobacteria from the Gulf of Maine that were phylogenetically distant from existing cultured strains. Shotgun sequencing and genome finishing yielded 1.9 Mbp in 17 contigs and 1.5 Mbp in 21 contigs for the two flavobacteria, with estimated genome recoveries of about 91\% and 78\%, respectively. Only 0.24\% of the assembling sequences were contaminants and were removed from further analysis using rigorous quality control. In contrast to all cultured strains of marine flavobacteria, the two single cell genomes were excellent Global Ocean Sampling (GOS) metagenome fragment recruiters, demonstrating their numerical significance in the ocean. The geographic distribution of GOS recruits along the Northwest Atlantic coast coincided with ocean surface currents. Metabolic reconstruction indicated diverse potential energy sources, including biopolymer degradation, proteorhodopsin photometabolism, and hydrogen oxidation. Compared to cultured relatives, the two uncultured flavobacteria have small genome sizes, few non-coding nucleotides, and few paralogous genes, suggesting adaptations to narrow ecological niches. These features may have contributed to the abundance of the two taxa in specific regions of the ocean, and may have hindered their cultivation.
We demonstrate the power of single cell DNA sequencing to generate reference genomes of uncultured taxa from a complex microbial community of marine bacterioplankton. A combination of single cell genomics and metagenomics enabled us to analyze the genome content, metabolic adaptations, and biogeography of these taxa.},
  timestamp = {2016-06-16T16:08:53Z},
  number = {4},
  journaltitle = {PLoS ONE},
  author = {Woyke, Tanja and Xie, Gary and Copeland, Alex and González, José M and Han, Cliff and Kiss, Hajnalka and Saw, Jimmy H and Senin, Pavel and Yang, Chi and Chatterji, Sourav and Cheng, Jan-Fang and Eisen, Jonathan A and Sieracki, Michael E and Stepanauskas, Ramunas},
  date = {2009-04},
  pages = {e5299},
  keywords = {16S,16S: genetics,16S: metabolism,Animals,Bacterial,Bacterial: genetics,Biodiversity,DNA,Genes,Genome,Genomics,Genomics: methods,Marine Biology,Phylogeny,Plankton,Rhodopsin,Rhodopsin: genetics,Ribosomal,RNA,Sequence Analysis},
  eprinttype = {pmid},
  eprint = {19390573}
}

% Wu et al., Nature 462(7276): the GEBA phylogeny-driven genome sequencing project.
% "Eisen, Jonathan A": the middle initial must follow the given name; as a leading
% lowercase "a" it is parsed as a name prefix (von part).
@article{WuPhylogenydriven2009,
  title = {A Phylogeny-Driven Genomic Encyclopaedia of {{Bacteria}} and {{Archaea}}},
  volume = {462},
  issn = {1476-4687},
  doi = {10.1038/nature08656},
  abstract = {Sequencing of bacterial and archaeal genomes has revolutionized our understanding of the many roles played by microorganisms. There are now nearly 1,000 completed bacterial and archaeal genomes available, most of which were chosen for sequencing on the basis of their physiology. As a result, the perspective provided by the currently available genomes is limited by a highly biased phylogenetic distribution. To explore the value added by choosing microbial genomes for sequencing on the basis of their evolutionary relationships, we have sequenced and analysed the genomes of 56 culturable species of Bacteria and Archaea selected to maximize phylogenetic coverage. Analysis of these genomes demonstrated pronounced benefits (compared to an equivalent set of genomes randomly selected from the existing database) in diverse areas including the reconstruction of phylogenetic history, the discovery of new protein families and biological properties, and the prediction of functions for known genes from other organisms. Our results strongly support the need for systematic 'phylogenomic' efforts to compile a phylogeny-driven 'Genomic Encyclopedia of Bacteria and Archaea' in order to derive maximum knowledge from existing microbial genome data as well as from genome sequences to come.},
  timestamp = {2016-06-16T16:08:54Z},
  number = {7276},
  journaltitle = {Nature},
  author = {Wu, Dongying and Hugenholtz, Philip and Mavromatis, Konstantinos and Pukall, Rüdiger and Dalin, Eileen and Ivanova, Natalia N and Kunin, Victor and Goodwin, Lynne and Wu, Martin and Tindall, Brian J and Hooper, Sean D and Pati, Amrita and Lykidis, Athanasios and Spring, Stefan and Anderson, Iain J and D'haeseleer, Patrik and Zemla, Adam and Singer, Mitchell and Lapidus, Alla and Nolan, Matt and Copeland, Alex and Han, Cliff and Chen, Feng and Cheng, Jan-Fang and Lucas, Susan and Kerfeld, Cheryl and Lang, Elke and Gronow, Sabine and Chain, Patrick and Bruce, David and Rubin, Edward M and Kyrpides, Nikos C and Klenk, Hans-Peter and Eisen, Jonathan A},
  date = {2009-12},
  pages = {1056--1060},
  keywords = {Actins,Actins: chemistry,Amino Acid Sequence,Archaea,Archaea: classification,Archaea: genetics,Archaeal,Archaeal: genetics,Bacteria,Bacteria: classification,Bacteria: genetics,Bacterial,Bacterial: genetics,Bacterial Proteins,Bacterial Proteins: chemistry,Biodiversity,Databases,Genes,Genetic,Genome,Models,Molecular,Molecular Sequence Data,Phylogeny,Protein Structure,rRNA,rRNA: genetics,Sequence Alignment,Tertiary},
  eprinttype = {pmid},
  eprint = {20033048}
}

% Wu and Scott, AMPHORA2 application note in Bioinformatics.
% NOTE(review): volume/number/pages/date replace the advance-access placeholder
% (pages 1--2, no volume); confirm against the DOI landing page.
@article{WuPhylogenomic2012,
  title = {Phylogenomic {{Analysis}} of {{Bacterial}} and {{Archaeal Sequences}} with {{AMPHORA2}}},
  volume = {28},
  issn = {1367-4811},
  doi = {10.1093/bioinformatics/bts079},
  abstract = {SUMMARY: With the explosive growth of bacterial and archaeal sequence data, large scale phylogenetic analyses present both opportunities and challenges. Here we describe AMPHORA2, an automated phylogenomic inference tool that can be used for high throughput, high quality genome tree reconstruction and metagenomic phylotyping. Compared to its predecessor, AMPHORA2 has several major enhancements and new functions: it has a greatly expanded phylogenetic marker database and can analyze both bacterial and archaeal sequences; it incorporates probability-based sequence alignment masks that improve the phylogenetic accuracy; it can analyze DNA as well as protein sequences and is more sensitive in marker identification; finally, it is over 100x faster in metagenomic phylotyping. AVAILABILITY: http://wolbachia.biology.virginia.edu/WuLab/Software.html. CONTACT: mw4yv@virginia.edu SUPPLEMENTARY INFORMATION: Supplementary data are available at Bioinformatics online.},
  timestamp = {2016-06-16T16:08:54Z},
  number = {7},
  journaltitle = {Bioinformatics},
  author = {Wu, Martin and Scott, Alexandra J},
  date = {2012-04},
  pages = {1033--1034},
  eprinttype = {pmid},
  eprint = {22332237}
}

% Wu and Ye, Journal of Computational Biology 18(3): the AbundanceBin binning method.
@article{WuNovel2011,
  title = {A {{Novel Abundance}}-{{Based Algorithm}} for {{Binning Metagenomic Sequences Using}} l-Tuples},
  volume = {18},
  issn = {1557-8666},
  doi = {10.1089/cmb.2010.0245},
  abstract = {Metagenomics is the study of microbial communities sampled directly from their natural environment, without prior culturing. Among the computational tools recently developed for metagenomic sequence analysis, binning tools attempt to classify the sequences in a metagenomic dataset into different bins (i.e., species), based on various DNA composition patterns (e.g., the tetramer frequencies) of various genomes. Composition-based binning methods, however, cannot be used to classify very short fragments, because of the substantial variation of DNA composition patterns within a single genome. We developed a novel approach (AbundanceBin) for metagenomics binning by utilizing the different abundances of species living in the same environment. AbundanceBin is an application of the Lander-Waterman model to metagenomics, which is based on the l-tuple content of the reads. AbundanceBin achieved accurate, unsupervised, clustering of metagenomic sequences into different bins, such that the reads classified in a bin belong to species of identical or very similar abundances in the sample. In addition, AbundanceBin gave accurate estimations of species abundances, as well as their genome sizes-two important parameters for characterizing a microbial community. We also show that AbundanceBin performed well when the sequence lengths are very short (e.g., 75 bp) or have sequencing errors. By combining AbundanceBin and a composition-based method (MetaCluster), we can achieve even higher binning accuracy. Supplementary Material is available at www.liebertonline.com/cmb.},
  timestamp = {2016-06-16T16:08:55Z},
  number = {3},
  journaltitle = {Journal of Computational Biology},
  author = {Wu, Yu-Wei and Ye, Yuzhen},
  date = {2011-03},
  pages = {523--534},
  keywords = {binning,em algorithm,metagenomics,Poisson Distribution,taxonomic binning,unsupervised taxonomic binning},
  eprinttype = {pmid},
  eprint = {21385052}
}

% Xie et al., Molecular Oral Microbiology 25(6): metagenome of a healthy human dental plaque sample.
@article{XieCommunity2010,
  title = {Community and Gene Composition of a Human Dental Plaque Microbiota Obtained by Metagenomic Sequencing},
  volume = {25},
  issn = {2041-1014},
  doi = {10.1111/j.2041-1014.2010.00587.x},
  abstract = {Human dental plaque is a complex microbial community containing an estimated 700 to 19,000 species/phylotypes. Despite numerous studies analysing species richness in healthy and diseased human subjects, the true genomic composition of the human dental plaque microbiota remains unknown. Here we report a metagenomic analysis of a healthy human plaque sample using a combination of second-generation sequencing platforms. A total of 860 million base pairs of non-human sequences were generated. Various analysis tools revealed the presence of 12 well-characterized phyla, members of the TM-7 and BRC1 clade, and sequences that could not be classified. Both pathogens and opportunistic pathogens were identified, supporting the ecological plaque hypothesis for oral diseases. Mapping the metagenomic reads to sequenced reference genomes demonstrated that 4\% of the reads could be assigned to the sequenced species. Preliminary annotation identified genes belonging to all known functional categories. Interestingly, although 73\% of the total assembled contig sequences were predicted to code for proteins, only 51\% of them could be assigned a functional role. Furthermore, {$\sim$}2.8\% of the total predicted genes coded for proteins involved in resistance to antibiotics and toxic compounds, suggesting that the oral cavity is an important reservoir for antimicrobial resistance.},
  timestamp = {2016-06-16T16:08:55Z},
  number = {6},
  journaltitle = {Molecular Oral Microbiology},
  author = {Xie, G and Chain, P S G and Lo, C-C and Liu, K-L and Gans, J and Merritt, J and Qi, F},
  date = {2010-12},
  pages = {391--405},
  keywords = {16S,16S: genetics,Actinobacteria,Actinobacteria: classification,Bacteria,Bacteria: classification,Bacterial,Bacterial: genetics,Bacterial Proteins,Bacterial Proteins: genetics,Bacteroidetes,Bacteroidetes: classification,Chromosome Mapping,Contig Mapping,Contig Mapping: methods,Cyanobacteria,Cyanobacteria: classification,Databases,Dental Plaque,Dental Plaque: microbiology,DNA,Drug Resistance,Ecosystem,Fibrobacter,Fibrobacter: classification,Fusobacteria,Fusobacteria: classification,Gastrointestinal Tract,Gastrointestinal Tract: microbiology,Genome,Humans,Metagenome,Metagenome: genetics,metagenomics,Metagenomics: methods,Nucleic Acid,Proteobacteria,Proteobacteria: classification,Ribosomal,RNA,Sequence Analysis,Spirochaetaceae,Spirochaetaceae: classification},
  eprinttype = {pmid},
  eprint = {21040513}
}

% Zhao, Tang and Ye, Bioinformatics 28(1): the RAPSearch2 protein similarity search tool.
% Given names are spelled out so that "Ye, Yuzhen" is recognised as the same person
% as in the other entries of this file.
% NOTE(review): date is 2011-10 while volume 28(1) is presumably the 2012 print issue; confirm which date is wanted.
@article{ZhaoRapsearch22011,
  title = {{{RAPSearch2}}: A Fast and Memory-Efficient Protein Similarity Search Tool for Next Generation Sequencing Data},
  volume = {28},
  issn = {1367-4803},
  doi = {10.1093/bioinformatics/btr595},
  timestamp = {2016-06-16T16:08:59Z},
  number = {1},
  journaltitle = {Bioinformatics},
  author = {Zhao, Yongan and Tang, Haixu and Ye, Yuzhen},
  date = {2011-10},
  pages = {125--126}
}

% Bulgarelli et al., Cell Host & Microbe 17(3): root microbiota of wild and domesticated barley.
% citationcount: presumably a reference-manager citation tally; kept in a custom
% field that biblatex ignores, because the note field is printed in the bibliography.
@article{BulgarelliStructure2015,
  title = {Structure and {{Function}} of the {{Bacterial Root Microbiota}} in {{Wild}} and {{Domesticated Barley}}},
  volume = {17},
  issn = {1931-3128},
  doi = {10.1016/j.chom.2015.01.011},
  abstract = {The microbial communities inhabiting the root interior of healthy plants, as well as the rhizosphere, which consists of soil particles firmly attached to roots, engage in symbiotic associations with their host. To investigate the structural and functional diversification among these communities, we employed a combination of 16S rRNA gene profiling and shotgun metagenome analysis of the microbiota associated with wild and domesticated accessions of barley (Hordeum vulgare). Bacterial families Comamonadaceae, Flavobacteriaceae, and Rhizobiaceae dominate the barley root-enriched microbiota. Host genotype has a small, but significant, effect on the diversity of root-associated bacterial communities, possibly representing a footprint of barley domestication. Traits related to pathogenesis, secretion, phage interactions, and nutrient mobilization are enriched in the barley root-associated microbiota. Strikingly, protein families assigned to these same traits showed evidence of positive selection. Our results indicate that the combined action of microbe-microbe and host-microbe interactions drives microbiota differentiation at the root-soil interface.},
  timestamp = {2016-06-17T11:37:49Z},
  number = {3},
  journaltitle = {Cell Host \& Microbe},
  shortjournal = {Cell Host \& Microbe},
  author = {Bulgarelli, Davide and Garrido-Oter, Ruben and Münch, Philipp~C. and Weiman, Aaron and Dröge, Johannes and Pan, Yao and McHardy, Alice~C. and Schulze-Lefert, Paul},
  urldate = {2016-06-17},
  date = {2015-03-11},
  pages = {392--403},
  citationcount = {00022}
}

% Luo et al., arXiv preprint 1604.02699 (Opal binning method). Typed as an online
% resource because the entry has no journal; the eprint fields identify it.
% citationcount: presumably a reference-manager citation tally; kept in a custom
% field that biblatex ignores, because the note field is printed in the bibliography.
@online{LuoLowdensity2016,
  title = {Low-Density Locality-Sensitive Hashing Boosts Metagenomic Binning},
  abstract = {Metagenomic binning is an essential task in analyzing metagenomic sequence datasets. To analyze structure or function of microbial communities from environmental samples, metagenomic sequence fragments are assigned to their taxonomic origins. Although sequence alignment algorithms can readily be used and usually provide high-resolution alignments and accurate binning results, the computational cost of such alignment-based methods becomes prohibitive as metagenomic datasets continue to grow. Alternative compositional-based methods, which exploit sequence composition by profiling local short k-mers in fragments, are often faster but less accurate than alignment-based methods. Inspired by the success of linear error correcting codes in noisy channel communication, we introduce Opal, a fast and accurate novel compositional-based binning method. It incorporates ideas from Gallager's low-density parity-check code to design a family of compact and discriminative locality-sensitive hashing functions that encode long-range compositional dependencies in long fragments. By incorporating the Gallager LSH functions as features in a simple linear SVM, Opal provides fast, accurate and robust binning for datasets consisting of a large number of species, even with mutations and sequencing errors. Opal not only performs up to two orders of magnitude faster than BWA, an alignment-based binning method, but also achieves improved binning accuracy and robustness to sequencing errors. Opal also outperforms models built on traditional k-mer profiles in terms of robustness and accuracy. Finally, we demonstrate that we can effectively use Opal in the ``coarse search'' stage of a compressive genomics pipeline to identify a much smaller candidate set of taxonomic origins for a subsequent alignment-based method to analyze, thus providing metagenomic binning with high scalability, high accuracy and high resolution.},
  timestamp = {2016-06-17T11:42:10Z},
  eprinttype = {arxiv},
  eprint = {1604.02699},
  eprintclass = {q-bio},
  author = {Luo, Yunan and Zeng, Jianyang and Berger, Bonnie and Peng, Jian},
  urldate = {2016-06-17},
  date = {2016-04-10},
  citationcount = {00000},
  keywords = {Quantitative Biology - Genomics,Quantitative Biology - Quantitative Methods}
}

% Lander and Waterman, Genomics 2(3): mathematical analysis of fingerprint-based physical mapping.
% citationcount: presumably a reference-manager citation tally; kept in a custom
% field that biblatex ignores, because the note field is printed in the bibliography.
@article{LanderGenomic1988,
  title = {Genomic Mapping by Fingerprinting Random Clones: {{A}} Mathematical Analysis},
  volume = {2},
  issn = {0888-7543},
  doi = {10.1016/0888-7543(88)90007-9},
  shorttitle = {Genomic Mapping by Fingerprinting Random Clones},
  abstract = {Results from physical mapping projects have recently been reported for the genomes of Escherichia coli, Saccharomyces cerevisiae, and Caenorhabditis elegans, and similar projects are currently being planned for other organisms. In such projects, the physical map is assembled by first “fingerprinting” a large number of clones chosen at random from a recombinant library and then inferring overlaps between clones with sufficiently similar fingerprints. Although the basic approach is the same, there are many possible choices for the fingerprint used to characterize the clones and the rules for declaring overlap. In this paper, we derive simple formulas showing how the progress of a physical mapping project is affected by the nature of the fingerprinting scheme. Using these formulas, we discuss the analytic considerations involved in selecting an appropriate fingerprinting scheme for a particular project.},
  timestamp = {2016-06-17T12:14:48Z},
  number = {3},
  journaltitle = {Genomics},
  shortjournal = {Genomics},
  author = {Lander, Eric S. and Waterman, Michael S.},
  urldate = {2016-06-17},
  date = {1988-04},
  pages = {231--239},
  citationcount = {00726}
}

% Kang et al., PeerJ 3, e1165: the MetaBAT genome binning tool.
% citationcount: presumably a reference-manager citation tally; kept in a custom
% field that biblatex ignores, because the note field is printed in the bibliography.
@article{KangMetabat2015,
  title = {{{MetaBAT}}, an Efficient Tool for Accurately Reconstructing Single Genomes from Complex Microbial Communities},
  volume = {3},
  issn = {2167-8359},
  doi = {10.7717/peerj.1165},
  timestamp = {2016-06-20T07:24:15Z},
  langid = {english},
  journaltitle = {PeerJ},
  author = {Kang, Dongwan D. and Froula, Jeff and Egan, Rob and Wang, Zhong},
  urldate = {2016-06-20},
  date = {2015-08-27},
  pages = {e1165},
  citationcount = {00021}
}

% Przyborowski and Wilenski, Biometrika 31(3/4): homogeneity test for Poisson samples.
% eprint holds the bare JSTOR stable id (no URL query string).
% NOTE(review): end page 323 supplied from memory of the published record; confirm on JSTOR.
@article{PrzyborowskiHomogeneity1940,
  title = {Homogeneity of {{Results}} in {{Testing Samples}} from {{Poisson Series}}: {{With}} an {{Application}} to {{Testing Clover Seed}} for {{Dodder}}},
  volume = {31},
  issn = {0006-3444},
  doi = {10.2307/2332612},
  shorttitle = {Homogeneity of {{Results}} in {{Testing Samples}} from {{Poisson Series}}},
  timestamp = {2016-06-20T09:05:49Z},
  eprinttype = {jstor},
  eprint = {2332612},
  issue = {3/4},
  journaltitle = {Biometrika},
  author = {Przyborowski, J. and Wilenski, H.},
  date = {1940-03},
  pages = {313--323}
}

% Langmead and Salzberg, Nature Methods 9(4): the Bowtie 2 gapped-read aligner.
@article{LangmeadFast2012,
  author = {Langmead, Ben and Salzberg, Steven L.},
  title = {Fast Gapped-Read Alignment with {{Bowtie}} 2},
  journaltitle = {Nature Methods},
  shortjournal = {Nat Meth},
  date = {2012-04},
  volume = {9},
  number = {4},
  pages = {357--359},
  issn = {1548-7091},
  doi = {10.1038/nmeth.1923},
  urldate = {2016-06-20},
  langid = {english},
  abstract = {As the rate of sequencing increases, greater throughput is demanded from read aligners. The full-text minute index is often used to make alignment very fast and memory-efficient, but the approach is ill-suited to finding longer, gapped alignments. Bowtie 2 combines the strengths of the full-text minute index with the flexibility and speed of hardware-accelerated dynamic programming algorithms to achieve a combination of high speed, sensitivity and accuracy.},
  keywords = {Bioinformatics,Genomics,Sequencing},
  rights = {© 2012 Nature Publishing Group, a division of Macmillan Publishers Limited. All Rights Reserved.},
  timestamp = {2016-06-20T09:10:32Z}
}

% Goodwin, McPherson and McCombie, Nature Reviews Genetics 17(6): review of sequencing technologies.
@article{GoodwinComing2016,
  title = {Coming of Age: Ten Years of Next-Generation Sequencing Technologies},
  volume = {17},
  rights = {© 2016 Nature Publishing Group, a division of Macmillan Publishers Limited. All Rights Reserved.},
  issn = {1471-0056},
  doi = {10.1038/nrg.2016.49},
  shorttitle = {Coming of Age},
  abstract = {Since the completion of the human genome project in 2003, extraordinary progress has been made in genome sequencing technologies, which has led to a decreased cost per megabase and an increase in the number and diversity of sequenced genomes. An astonishing complexity of genome architecture has been revealed, bringing these sequencing technologies to even greater advancements. Some approaches maximize the number of bases sequenced in the least amount of time, generating a wealth of data that can be used to understand increasingly complex phenotypes. Alternatively, other approaches now aim to sequence longer contiguous pieces of DNA, which are essential for resolving structurally complex regions. These and other strategies are providing researchers and clinicians a variety of tools to probe genomes in greater depth, leading to an enhanced understanding of how genome sequence variants underlie phenotype and disease.},
  timestamp = {2016-06-20T09:19:39Z},
  langid = {english},
  number = {6},
  journaltitle = {Nature Reviews Genetics},
  shortjournal = {Nat Rev Genet},
  author = {Goodwin, Sara and McPherson, John D. and McCombie, W. Richard},
  urldate = {2016-06-20},
  date = {2016-06},
  pages = {333--351},
  keywords = {DNA sequencing,Genome,Genomics,Next-generation sequencing}
}

% Wu et al., Microbiome 2, article 26: the MaxBin expectation-maximization binning method.
% citationcount: presumably a reference-manager citation tally; kept in a custom
% field that biblatex ignores, because the note field is printed in the bibliography.
@article{WuMaxbin2014,
  title = {{{MaxBin}}: An Automated Binning Method to Recover Individual Genomes from Metagenomes Using an Expectation-Maximization Algorithm},
  volume = {2},
  issn = {2049-2618},
  doi = {10.1186/2049-2618-2-26},
  shorttitle = {{{MaxBin}}},
  abstract = {Recovering individual genomes from metagenomic datasets allows access to uncultivated microbial populations that may have important roles in natural and engineered ecosystems. Understanding the roles of these uncultivated populations has broad application in ecology, evolution, biotechnology and medicine. Accurate binning of assembled metagenomic sequences is an essential step in recovering the genomes and understanding microbial functions.},
  timestamp = {2016-06-20T14:47:58Z},
  journaltitle = {Microbiome},
  shortjournal = {Microbiome},
  author = {Wu, Yu-Wei and Tang, Yung-Hsu and Tringe, Susannah G. and Simmons, Blake A. and Singer, Steven W.},
  urldate = {2016-06-20},
  date = {2014},
  pages = {26},
  citationcount = {00041},
  keywords = {Binning,Expectation-maximization algorithm,Metagenomics}
}

% van der Walt, Colbert and Varoquaux, Computing in Science & Engineering 13(2): the NumPy array.
% citationcount: presumably a reference-manager citation tally; kept in a custom
% field that biblatex ignores, because the note field is printed in the bibliography.
@article{WaltNumpy2011,
  title = {The {{NumPy Array}}: {{A Structure}} for {{Efficient Numerical Computation}}},
  volume = {13},
  issn = {1521-9615},
  doi = {10.1109/MCSE.2011.37},
  shorttitle = {The {{NumPy Array}}},
  abstract = {In the Python world, NumPy arrays are the standard representation for numerical data and enable efficient implementation of numerical computations in a high-level language. As this effort shows, NumPy performance can be improved through three techniques: vectorizing calculations, avoiding copying data in memory, and minimizing operation counts.},
  timestamp = {2016-07-06T12:43:05Z},
  number = {2},
  journaltitle = {Computing in Science \& Engineering},
  author = {van der Walt, Stéfan and Colbert, S. Chris and Varoquaux, Gaël},
  date = {2011-03},
  pages = {22--30},
  citationcount = {00579},
  keywords = {Arrays,Computational efficiency,data structures,Finite element methods,high level language,high level languages,mathematics computing,numerical analysis,numerical computation,numerical computations,numerical data,NumPy,numpy array,Performance evaluation,programming libraries,Python,Python programming language,Resource management,scientific programming,Vector quantization}
}

% Karlin, Mrazek and Campbell, Journal of Bacteriology 179(12): compositional biases of bacterial genomes.
% citationcount: presumably a reference-manager citation tally; kept in a custom
% field that biblatex ignores, because the note field is printed in the bibliography.
@article{KarlinCompositional1997,
  title = {Compositional Biases of Bacterial Genomes and Evolutionary Implications},
  volume = {179},
  timestamp = {2016-07-05T12:34:49Z},
  number = {12},
  journaltitle = {Journal of Bacteriology},
  author = {Karlin, Samuel and Mrazek, Jan and Campbell, Allan M.},
  urldate = {2016-07-05},
  date = {1997},
  pages = {3899--3913},
  citationcount = {00388}
}

% Nielsen et al., Nature Biotechnology 32(8): co-abundance gene groups for reference-free binning.
% citationcount: presumably a reference-manager citation tally; kept in a custom
% field that biblatex ignores, because the note field is printed in the bibliography.
@article{NielsenIdentification2014,
  title = {Identification and Assembly of Genomes and Genetic Elements in Complex Metagenomic Samples without Using Reference Genomes},
  volume = {32},
  rights = {© 2014 Nature Publishing Group, a division of Macmillan Publishers Limited. All Rights Reserved.},
  issn = {1087-0156},
  doi = {10.1038/nbt.2939},
  abstract = {Most current approaches for analyzing metagenomic data rely on comparisons to reference genomes, but the microbial diversity of many environments extends far beyond what is covered by reference databases. De novo segregation of complex metagenomic data into specific biological entities, such as particular bacterial strains or viruses, remains a largely unsolved problem. Here we present a method, based on binning co-abundant genes across a series of metagenomic samples, that enables comprehensive discovery of new microbial organisms, viruses and co-inherited genetic entities and aids assembly of microbial genomes without the need for reference sequences. We demonstrate the method on data from 396 human gut microbiome samples and identify 7,381 co-abundance gene groups (CAGs), including 741 metagenomic species (MGS). We use these to assemble 238 high-quality microbial genomes and identify affiliations between MGS and hundreds of viruses or genetic entities. Our method provides the means for comprehensive profiling of the diversity within complex metagenomic samples.},
  timestamp = {2016-07-27T15:12:33Z},
  langid = {english},
  number = {8},
  journaltitle = {Nature Biotechnology},
  shortjournal = {Nat Biotech},
  author = {Nielsen, H. Bjørn and Almeida, Mathieu and Juncker, Agnieszka Sierakowska and Rasmussen, Simon and Li, Junhua and Sunagawa, Shinichi and Plichta, Damian R. and Gautier, Laurent and Pedersen, Anders G. and Le Chatelier, Emmanuelle and Pelletier, Eric and Bonde, Ida and Nielsen, Trine and Manichanh, Chaysavanh and Arumugam, Manimozhiyan and Batto, Jean-Michel and Quintanilha dos Santos, Marcelo B. and Blom, Nikolaj and Borruel, Natalia and Burgdorf, Kristoffer S. and Boumezbeur, Fouad and Casellas, Francesc and Doré, Joël and Dworzynski, Piotr and Guarner, Francisco and Hansen, Torben and Hildebrand, Falk and Kaas, Rolf S. and Kennedy, Sean and Kristiansen, Karsten and Kultima, Jens Roat and Léonard, Pierre and Levenez, Florence and Lund, Ole and Moumen, Bouziane and Le Paslier, Denis and Pons, Nicolas and Pedersen, Oluf and Prifti, Edi and Qin, Junjie and Raes, Jeroen and Sørensen, Søren and Tap, Julien and Tims, Sebastian and Ussery, David W. and Yamada, Takuji and {MetaHIT Consortium} and Renault, Pierre and Sicheritz-Ponten, Thomas and Bork, Peer and Wang, Jun and Brunak, Søren and Ehrlich, S. Dusko},
  urldate = {2016-07-27},
  date = {2014-08},
  pages = {822--828},
  citationcount = {00134},
  keywords = {Genetic variation,Genome assembly algorithms,Microbial genetics,Time series}
}

@article{HagenQuantitative2016,
  title = {Quantitative Metaproteomics Highlight the Metabolic Contributions of Uncultured Phylotypes in a Thermophilic Anaerobic Digester},
  issn = {0099-2240, 1098-5336},
  doi = {10.1128/AEM.01955-16},
  abstract = {In this study, we used multiple meta-omic approaches to characterize the microbial community and the active metabolic pathways of a stable industrial biogas reactor with food waste as the dominant feedstock, operating at thermophilic temperatures (60°C) and elevated levels of free ammonia (367 mg NH3-N/L). The microbial community was strongly dominated (76\% of all 16S rRNA amplicon reads) by populations affiliated to the proteolytic bacterium, Coprothermobacter proteolyticus. Multiple Coprothermobacter-affiliated strains were detected, introducing an additional level of complexity seldom explored in biogas studies. Genome reconstructions provided metabolic insight into the microbes that performed biomass deconstruction and fermentation, including the deeply branching phyla Dictyoglomi, Planctomycetes and candidate phylum Atribacteria. These biomass degraders were complemented by a synergistic network of microorganisms that convert key fermentation intermediates (fatty acids) via syntrophic interactions with hydrogenotrophic methanogens, to ultimately produce methane. Interpretation of the proteomics data also suggested activity of a Methanosaeta phylotype acclimatized to high ammonia level. In particular, we report multiple novel phylotypes proposed as syntrophic acetate oxidizers, which also exerts expression of enzymes needed for both the Wood Ljungdahl pathway and β-oxidation of fatty acids to acetyl-CoA. Such an arrangement differs from known syntrophic oxidizing bacteria and presents an interesting hypothesis for future studies. Collectively, these findings provide increased insight into active metabolic roles of uncultured phylotypes and presents new synergistic relationships, both of which may contribute to the stability of the biogas reactor.
Importance Biogas production through anaerobic digestion of organic waste provides an attractive source of renewable energy and a sustainable waste management strategy. A comprehensive understanding of the microbial community that drives anaerobic digesters is essential to ensure stable and efficient energy production. Here, we characterize the intricate microbial networks and metabolic pathways in a thermophilic biogas reactor. We discuss the impact of frequently encountered microbial populations as well as the metabolism of newly discovered novel phylotypes that seem to play distinct roles within key microbial stages of anaerobic digestion in this stable high-temperature system. In particular, we draft a metabolic scenario whereby multiple uncultured SAOBs are capable of syntrophically oxidizing acetate as well as longer-chain fatty acids (via the β-oxidation and Wood-Ljundahl pathways) to hydrogen and carbon dioxide, which methanogens subsequently convert to methane.},
  timestamp = {2016-11-25T12:42:53Z},
  langid = {english},
  journaltitle = {Applied and Environmental Microbiology},
  shortjournal = {Appl. Environ. Microbiol.},
  author = {Hagen, Live H. and Frank, Jeremy A. and Zamanzadeh, Mirzaman and Eijsink, Vincent G. H. and Pope, Phillip B. and Horn, Svein J. and Arntzen, Magnus Ø.},
  urldate = {2016-11-25},
  date = {2016-11-04},
  eid = {e01955-16},
  note = {00000},
  eprinttype = {pmid},
  eprint = {27815274}
}

@article{DrogeTaxonomic2012,
  title = {Taxonomic Binning of Metagenome Samples Generated by Next-Generation Sequencing Technologies},
  volume = {13},
  issn = {1467-5463, 1477-4054},
  doi = {10.1093/bib/bbs031},
  abstract = {Metagenome research uses random shotgun sequencing of microbial community DNA to study the genetic sequences of its members without cultivation. This development has been strongly supported by improvements in sequencing technologies, which have rendered sequencing cheaper than before. As a consequence, downstream computational analysis of metagenome sequence samples is now faced with large amounts of complex data. One of the essential steps in metagenome analysis is reconstruction of draft genomes for populations of a community or of draft ‘pan-genomes’ for higher level clades. ‘Taxonomic binning’ corresponds to the process of assigning a taxonomic identifier to sequence fragments, based on information such as sequence similarity, sequence composition or read coverage. This is used for draft genome reconstruction, if sequencing coverage is insufficient for reconstruction based on assembly information alone. Subsequent functional and metabolic annotation of draft genomes allows a genome-level analysis of novel uncultured microbial species and even inference of their cultivation requirements.},
  timestamp = {2016-11-25T13:45:48Z},
  langid = {english},
  number = {6},
  journaltitle = {Briefings in Bioinformatics},
  shortjournal = {Brief Bioinform},
  author = {Dröge, Johannes and McHardy, Alice C.},
  urldate = {2016-11-25},
  date = {2012-11-01},
  pages = {646--655},
  note = {00063},
  keywords = {Metagenomics,Next-generation sequencing,taxonomic binning},
  eprinttype = {pmid},
  eprint = {22851513}
}

@article{DongReconstructing2017,
  title = {Reconstructing Metabolic Pathways of a Member of the Genus {{Pelotomaculum}} Suggesting Its Potential to Oxidize Benzene to Carbon Dioxide with Direct Reduction of Sulfate},
  volume = {93},
  issn = {0168-6496},
  doi = {10.1093/femsec/fiw254},
  timestamp = {2017-02-18T13:49:17Z},
  number = {3},
  journaltitle = {FEMS Microbiology Ecology},
  shortjournal = {FEMS Microbiol Ecol},
  author = {Dong, Xiyang and Dröge, Johannes and von Toerne, Christine and Marozava, Sviatlana and McHardy, Alice C. and Meckenstock, Rainer U.},
  urldate = {2017-02-18},
  date = {2017-03-01},
  eid = {fiw254},
  options = {useprefix=true}
}

@article{EloefadroshMetagenomics2016,
  author       = {Eloe-Fadrosh, Emiley A. and Ivanova, Natalia N. and Woyke, Tanja and Kyrpides, Nikos C.},
  title        = {Metagenomics Uncovers Gaps in Amplicon-Based Detection of Microbial Diversity},
  journaltitle = {Nature Microbiology},
  volume       = {1},
  number       = {4},
  pages        = {15032},
  date         = {2016-02-01},
  issn         = {2058-5276},
  doi          = {10.1038/nmicrobiol.2015.32},
  urldate      = {2017-02-18},
  timestamp    = {2017-02-18T14:03:27Z}
}

@article{BremgesMecors2016,
  author       = {Bremges, Andreas and Singer, Esther and Woyke, Tanja and Sczyrba, Alexander},
  title        = {{{MeCorS}}: {{Metagenome}}-Enabled Error Correction of Single Cell Sequencing Reads},
  shorttitle   = {{{MeCorS}}},
  journaltitle = {Bioinformatics},
  shortjournal = {Bioinformatics},
  volume       = {32},
  number       = {14},
  pages        = {2199--2201},
  date         = {2016-07-15},
  issn         = {1367-4803},
  doi          = {10.1093/bioinformatics/btw144},
  urldate      = {2017-02-18},
  timestamp    = {2017-02-18T14:09:09Z}
}

@article{MendeImproved2016,
  title = {Improved {{Environmental Genomes}} via {{Integration}} of {{Metagenomic}} and {{Single}}-{{Cell Assemblies}}},
  volume = {7},
  issn = {1664-302X},
  doi = {10.3389/fmicb.2016.00143},
  abstract = {Assembling complete or near complete genomes from complex microbial communities remains a significant challenge in metagenomic studies. Recent developments in single cell amplified genomes (SAGs) have enabled the sequencing of individual draft genomes representative of uncultivated microbial populations. SAGs suffer from incomplete and uneven coverage due to artifacts that arise from multiple displacement amplification techniques. Conversely, metagenomic sequence data does not suffer from the same biases as SAGs, and significant improvements have been realized in the recovery of draft genomes from metagenomes. Nevertheless, the inherent genomic complexity of many microbial communities often obfuscates facile generation of population genome assemblies from metagenomic data. Here we describe a new method for metagenomic-guided SAG assembly that leverages the advantages of both methods and significantly improves the completeness of initial SAGs assemblies. We demonstrate that SAG assemblies of two cosmopolitan marine lineages–Marine Group 1 Thaumarchaeota and SAR324 clade bacterioplankton–were substantially improved using this approach. Moreover, the improved assemblies strengthened biological inferences. For example, the improved SAR324 clade genome assembly revealed the presence of many genes in phenylalanine catabolism and flagellar assembly that were absent in the original SAG.},
  timestamp = {2017-02-18T14:13:13Z},
  journaltitle = {Frontiers in Microbiology},
  shortjournal = {Front Microbiol},
  author = {Mende, Daniel R. and Aylward, Frank O. and Eppley, John M. and Nielsen, Torben N. and DeLong, Edward F.},
  urldate = {2017-02-18},
  date = {2016-02-11},
  eid = {143},
  eprinttype = {pmid},
  eprint = {26904016},
  pmcid = {PMC4749706}
}

@article{GawadSinglecell2016,
  title = {Single-Cell Genome Sequencing: Current State of the Science},
  volume = {17},
  rights = {© 2016 Nature Publishing Group, a division of Macmillan Publishers Limited. All Rights Reserved.},
  issn = {1471-0056},
  doi = {10.1038/nrg.2015.16},
  shorttitle = {Single-Cell Genome Sequencing},
  abstract = {The field of single-cell genomics is advancing rapidly and is generating many new insights into complex biological systems, ranging from the diversity of microbial ecosystems to the genomics of human cancer. In this Review, we provide an overview of the current state of the field of single-cell genome sequencing. First, we focus on the technical challenges of making measurements that start from a single molecule of DNA, and then explore how some of these recent methodological advancements have enabled the discovery of unexpected new biology. Areas highlighted include the application of single-cell genomics to interrogate microbial dark matter and to evaluate the pathogenic roles of genetic mosaicism in multicellular organisms, with a focus on cancer. We then attempt to predict advances we expect to see in the next few years.},
  timestamp = {2017-02-18T14:17:02Z},
  langid = {english},
  number = {3},
  journaltitle = {Nature Reviews Genetics},
  shortjournal = {Nat Rev Genet},
  author = {Gawad, Charles and Koh, Winston and Quake, Stephen R.},
  urldate = {2017-02-18},
  date = {2016-03},
  pages = {175--188},
  keywords = {Cancer genomics,DNA sequencing,Genomics,Microbial genetics,Mosaicism,Next-generation sequencing,PCR-based techniques}
}

@article{AlnebergBinning2014,
  author       = {Alneberg, Johannes and Bjarnason, Brynjar Smári and de Bruijn, Ino and Schirmer, Melanie and Quick, Joshua and Ijaz, Umer Z. and Lahti, Leo and Loman, Nicholas J. and Andersson, Anders F. and Quince, Christopher},
  title        = {Binning Metagenomic Contigs by Coverage and Composition},
  journaltitle = {Nature Methods},
  shortjournal = {Nat Meth},
  volume       = {11},
  number       = {11},
  pages        = {1144--1146},
  date         = {2014-11},
  issn         = {1548-7091},
  doi          = {10.1038/nmeth.3103},
  langid       = {english},
  keywords     = {Genome informatics,Machine learning,Metagenomics},
  abstract     = {Shotgun sequencing enables the reconstruction of genomes from complex microbial communities, but because assembly does not reconstruct entire genomes, it is necessary to bin genome fragments. Here we present CONCOCT, a new algorithm that combines sequence composition and coverage across multiple samples, to automatically cluster contigs into genomes. We demonstrate high recall and precision on artificial as well as real human gut metagenome data sets.},
  rights       = {© 2014 Nature Publishing Group, a division of Macmillan Publishers Limited. All Rights Reserved.},
  urldate      = {2017-02-18},
  timestamp    = {2017-02-18T14:24:42Z},
  options      = {useprefix=true}
}

@article{SedlarBioinformatics2017,
  title = {Bioinformatics Strategies for Taxonomy Independent Binning and Visualization of Sequences in Shotgun Metagenomics},
  volume = {15},
  issn = {2001-0370},
  doi = {10.1016/j.csbj.2016.11.005},
  timestamp = {2017-02-18T14:25:52Z},
  langid = {english},
  journaltitle = {Computational and Structural Biotechnology Journal},
  author = {Sedlar, Karel and Kupkova, Kristyna and Provaznik, Ivo},
  urldate = {2017-02-18},
  date = {2017},
  pages = {48--55}
}

@article{LinAccurate2016,
  title = {Accurate Binning of Metagenomic Contigs via Automated Clustering Sequences Using Information of Genomic Signatures and Marker Genes},
  volume = {6},
  issn = {2045-2322},
  doi = {10.1038/srep24175},
  abstract = {Metagenomics, the application of shotgun sequencing, facilitates the reconstruction of the genomes of individual species from natural environments. A major challenge in the genome recovery domain is to agglomerate or ‘bin’ sequences assembled from metagenomic reads into individual groups. Metagenomic binning without consideration of reference sequences enables the comprehensive discovery of new microbial organisms and aids in the microbial genome reconstruction process. Here we present MyCC, an automated binning tool that combines genomic signatures, marker genes and optional contig coverages within one or multiple samples, in order to visualize the metagenomes and to identify the reconstructed genomic fragments. We demonstrate the superior performance of MyCC compared to other binning tools including CONCOCT, GroopM, MaxBin and MetaBAT on both synthetic and real human gut communities with a small sample size (one to 11 samples), as well as on a large metagenome dataset (over 250 samples). Moreover, we demonstrate the visualization of metagenomes in MyCC to aid in the reconstruction of genomes from distinct bins. MyCC is freely available at http://sourceforge.net/projects/sb2nhri/files/MyCC/.},
  timestamp = {2017-02-18T14:31:21Z},
  journaltitle = {Scientific Reports},
  shortjournal = {Sci Rep},
  author = {Lin, Hsin-Hung and Liao, Yu-Chieh},
  urldate = {2017-02-18},
  date = {2016-04-12},
  eid = {24175},
  eprinttype = {pmid},
  eprint = {27067514},
  pmcid = {PMC4828714}
}

@article{ErenAnvi2015,
  author       = {Eren, A. Murat and Esen, Özcan C. and Quince, Christopher and Vineis, Joseph H. and Morrison, Hilary G. and Sogin, Mitchell L. and Delmont, Tom O.},
  title        = {Anvi’o: An Advanced Analysis and Visualization Platform for ‘omics Data},
  shorttitle   = {Anvi’o},
  journaltitle = {PeerJ},
  volume       = {3},
  pages        = {e1319},
  date         = {2015-10-08},
  issn         = {2167-8359},
  doi          = {10.7717/peerj.1319},
  langid       = {english},
  urldate      = {2017-02-18},
  timestamp    = {2017-02-18T14:35:05Z}
}

@article{FuhrmanMarine2015,
  title = {Marine Microbial Community Dynamics and Their Ecological Interpretation},
  volume = {13},
  rights = {© 2015 Nature Publishing Group, a division of Macmillan Publishers Limited. All Rights Reserved.},
  issn = {1740-1526},
  doi = {10.1038/nrmicro3417},
  abstract = {Recent advances in studying the dynamics of marine microbial communities have shown that the composition of these communities follows predictable patterns and involves complex network interactions, which shed light on the underlying processes regulating these globally important organisms. Such 'holistic' (or organism- and system-based) studies of these communities complement popular reductionist, often culture-based, approaches for understanding organism function one gene or protein at a time. In this Review, we summarize our current understanding of marine microbial community dynamics at various scales, from hours to decades. We also explain how the data illustrate community resilience and seasonality, and reveal interactions among microorganisms.},
  timestamp = {2017-02-18T14:54:45Z},
  langid = {english},
  number = {3},
  journaltitle = {Nature Reviews Microbiology},
  shortjournal = {Nat Rev Micro},
  author = {Fuhrman, Jed A. and Cram, Jacob A. and Needham, David M.},
  urldate = {2017-02-18},
  date = {2015-03},
  pages = {133--146},
  keywords = {Environmental microbiology,Microbial ecology}
}

@article{BerryDeciphering2014,
  title = {Deciphering Microbial Interactions and Detecting Keystone Species with Co-Occurrence Networks},
  volume = {5},
  issn = {1664-302X},
  doi = {10.3389/fmicb.2014.00219},
  abstract = {Co-occurrence networks produced from microbial survey sequencing data are frequently used to identify interactions between community members. While this approach has potential to reveal ecological processes, it has been insufficiently validated due to the technical limitations inherent in studying complex microbial ecosystems. Here, we simulate multi-species microbial communities with known interaction patterns using generalized Lotka-Volterra dynamics, construct co-occurrence networks, and evaluate how well networks reveal the underlying interactions, and how experimental and ecological parameters can affect network inference and interpretation. We find that co-occurrence networks can recapitulate interaction networks under certain conditions, but that they lose interpretability when the effects of habitat filtering become significant. We demonstrate that networks suffer from local hot spots of spurious correlation in the neighborhood of “hub” species that engage in many interactions. We also identify topological features associated with keystone species in co-occurrence networks. This study provides a substantiated framework to guide environmental microbiologists in the construction and interpretation of co-occurrence networks from microbial survey datasets.},
  timestamp = {2017-02-18T14:54:55Z},
  langid = {english},
  journaltitle = {Frontiers in Microbiology},
  shortjournal = {Front. Microbiol.},
  author = {Berry, David and Widder, Stefanie},
  urldate = {2017-02-18},
  date = {2014},
  eid = {219},
  keywords = {16S rRNA sequencing surveys,Correlation analysis,habitat filtering,keystone species,Lotka-Volterra models,microbial competition,microbial cooperation,Network analysis}
}

@article{GuttmanMicrobial2014,
  title = {Microbial Genome-Enabled Insights into Plant-Microorganism Interactions},
  volume = {15},
  rights = {© 2014 Nature Publishing Group, a division of Macmillan Publishers Limited. All Rights Reserved.},
  issn = {1471-0056},
  doi = {10.1038/nrg3748},
  abstract = {Advances in genome-based studies on plant-associated microorganisms have transformed our understanding of many plant pathogens and are beginning to greatly widen our knowledge of plant interactions with mutualistic and commensal microorganisms. Pathogenomics has revealed how pathogenic microorganisms adapt to particular hosts, subvert innate immune responses and change host range, as well as how new pathogen species emerge. Similarly, culture-independent community profiling methods, coupled with metagenomic and metatranscriptomic studies, have provided the first insights into the emerging field of research on plant-associated microbial communities. Together, these approaches have the potential to bridge the gap between plant microbial ecology and plant pathology, which have traditionally been two distinct research fields.},
  timestamp = {2017-02-18T14:58:24Z},
  langid = {english},
  number = {12},
  journaltitle = {Nature Reviews Genetics},
  shortjournal = {Nat Rev Genet},
  author = {Guttman, David S. and McHardy, Alice C. and Schulze-Lefert, Paul},
  urldate = {2017-02-18},
  date = {2014-12},
  pages = {797--813},
  keywords = {Metagenomics,Microbial genetics,Pathogens,Phylogenetics}
}

@article{StewartGrowing2012,
  author       = {Stewart, E. J.},
  title        = {Growing {{Unculturable Bacteria}}},
  journaltitle = {Journal of Bacteriology},
  volume       = {194},
  number       = {16},
  pages        = {4151--4160},
  date         = {2012-08-15},
  issn         = {0021-9193},
  doi          = {10.1128/JB.00345-12},
  langid       = {english},
  urldate      = {2017-02-18},
  timestamp    = {2017-02-18T15:12:06Z}
}

@article{GillespieIsolation2002,
  title = {Isolation of {{Antibiotics Turbomycin A}} and {{B}} from a {{Metagenomic Library}} of {{Soil Microbial DNA}}},
  volume = {68},
  issn = {0099-2240, 1098-5336},
  doi = {10.1128/AEM.68.9.4301-4306.2002},
  abstract = {To access the genetic and biochemical potential of soil microorganisms by culture-independent methods, a 24,546-member library in Escherichia coli with DNA extracted directly from soil had previously been constructed (M. R. Rondon, P. R. August, A. D. Bettermann, S. F. Brady, T. H. Grossman, M. R. Liles, K. A. Loiacono, B. A. Lynch, I. A. MacNeil, M. S. Osburne, J. Clardy, J. Handelsman, and R. M. Goodman, Appl. Environ. Microbiol. 66:2541-2547, 2000). Three clones, P57G4, P89C8, and P214D2, produced colonies with a dark brown melanin-like color. We fractionated the culture supernatant of P57G4 to identify the pigmented compound or compounds. Methanol extracts of the acid precipitate from the culture supernatant contained a red and an orange pigment. Structural analysis revealed that these were triaryl cations, designated turbomycin A and turbomycin B, respectively; both exhibited broad-spectrum antibiotic activity against gram-negative and gram-positive organisms. Mutagenesis, subcloning, and sequence analysis of the 25-kb insert in P57G4 demonstrated that a single open reading frame was necessary and sufficient to confer production of the brown, orange, and red pigments on E. coli; the predicted product of this sequence shares extensive sequence similarity with members of the 4-hydroxyphenylpyruvate dioxygenase (4HPPD) family of enzymes. Another member of the same family of genes, lly, which is required for production of the hemolytic pigment in Legionella pneumophila, also conferred production of turbomycin A and B on E. coli. We further demonstrated that turbomycin A and turbomycin B are produced from the interaction of indole, normally secreted by E. coli, with homogentisic acid synthesized by the 4HPPD gene products. 
The results demonstrate successful heterologous expression of DNA extracted directly from soil as a means to access previously uncharacterized small organic compounds, serving as an example of a chimeric pathway for the generation of novel chemical structures.},
  timestamp = {2017-02-18T15:17:07Z},
  langid = {english},
  number = {9},
  journaltitle = {Applied and Environmental Microbiology},
  shortjournal = {Appl. Environ. Microbiol.},
  author = {Gillespie, Doreen E. and Brady, Sean F. and Bettermann, Alan D. and Cianciotto, Nicholas P. and Liles, Mark R. and Rondon, Michelle R. and Clardy, Jon and Goodman, Robert M. and Handelsman, Jo},
  urldate = {2017-02-18},
  date = {2002-09-01},
  pages = {4301--4306},
  eprinttype = {pmid},
  eprint = {12200279}
}

@article{RiesenfeldUncultured2004,
  author       = {Riesenfeld, Christian S. and Goodman, Robert M. and Handelsman, Jo},
  title        = {Uncultured Soil Bacteria Are a Reservoir of New Antibiotic Resistance Genes},
  journaltitle = {Environmental Microbiology},
  volume       = {6},
  number       = {9},
  pages        = {981--989},
  date         = {2004-09-01},
  issn         = {1462-2920},
  doi          = {10.1111/j.1462-2920.2004.00664.x},
  langid       = {english},
  abstract     = {Antibiotic resistance genes are typically isolated by cloning from cultured bacteria or by polymerase chain reaction (PCR) amplification from environmental samples. These methods do not access the potential reservoir of undiscovered antibiotic resistance genes harboured by soil bacteria because most soil bacteria are not cultured readily, and PCR detection of antibiotic resistance genes depends on primers that are based on known genes. To explore this reservoir, we isolated DNA directly from soil samples, cloned the DNA and selected for clones that expressed antibiotic resistance in Escherichia coli. We constructed four libraries that collectively contain 4.1 gigabases of cloned soil DNA. From these and two previously reported libraries, we identified nine clones expressing resistance to aminoglycoside antibiotics and one expressing tetracycline resistance. Based on the predicted amino acid sequences of the resistance genes, the resistance mechanisms include efflux of tetracycline and inactivation of aminoglycoside antibiotics by phosphorylation and acetylation. With one exception, all the sequences are considerably different from previously reported sequences. The results indicate that soil bacteria are a reservoir of antibiotic resistance genes with greater genetic diversity than previously accounted for, and that the diversity can be surveyed by a culture-independent method.},
  urldate      = {2017-02-18},
  timestamp    = {2017-02-18T15:17:18Z}
}

@article{GarrettMetagenomic2010,
  author       = {Garrett, Roger A. and Prangishvili, David and Shah, Shiraz A. and Reuter, Monika and Stetter, Karl O. and Peng, Xu},
  title        = {Metagenomic Analyses of Novel Viruses and Plasmids from a Cultured Environmental Sample of Hyperthermophilic Neutrophiles},
  journaltitle = {Environmental Microbiology},
  volume       = {12},
  number       = {11},
  pages        = {2918--2930},
  date         = {2010-11-01},
  issn         = {1462-2920},
  doi          = {10.1111/j.1462-2920.2010.02266.x},
  langid       = {english},
  abstract     = {Two novel viral genomes and four plasmids were assembled from an environmental sample collected from a hot spring at Yellowstone National Park, USA, and maintained anaerobically in a bioreactor at 85°C and pH 6. The double-stranded DNA viral genomes are linear (22.7~kb) and circular (17.7~kb), and derive apparently from archaeal viruses HAV1 and HAV2. Genomic DNA was obtained from samples enriched in filamentous and tadpole-shaped virus-like particles respectively. They yielded few significant matches in public sequence databases reinforcing, further, the wide diversity of archaeal viruses. Several variants of HAV1 exhibit major genomic alterations, presumed to arise from viral adaptation to different hosts. They include insertions up to 350~bp, deletions up to 1.5~kb, and genes with extensively altered sequences. Some result from recombination events occurring at low complexity direct repeats distributed along the genome. In addition, a 33.8~kb archaeal plasmid pHA1 was characterized, encoding a possible conjugative apparatus, as well as three cryptic plasmids of thermophilic bacterial origin, pHB1 of 2.1~kb and two closely related variants pHB2a and pHB2b, of 5.2 and 4.8~kb respectively. Strategies are considered for assembling genomes of smaller genetic elements from complex environmental samples, and for establishing possible host identities on the basis of sequence similarity to host CRISPR immune systems.},
  urldate      = {2017-02-18},
  timestamp    = {2017-02-18T15:22:34Z}
}

@article{CuvelierTargeted2010,
  author       = {Cuvelier, Marie L. and Allen, Andrew E. and Monier, Adam and McCrow, John P. and Messié, Monique and Tringe, Susannah G. and Woyke, Tanja and Welsh, Rory M. and Ishoey, Thomas and Lee, Jae-Hyeok and Binder, Brian J. and DuPont, Chris L. and Latasa, Mikel and Guigand, Cédric and Buck, Kurt R. and Hilton, Jason and Thiagarajan, Mathangi and Caler, Elisabet and Read, Betsy and Lasken, Roger S. and Chavez, Francisco P. and Worden, Alexandra Z.},
  title        = {Targeted Metagenomics and Ecology of Globally Important Uncultured Eukaryotic Phytoplankton},
  journaltitle = {Proceedings of the National Academy of Sciences},
  shortjournal = {PNAS},
  volume       = {107},
  number       = {33},
  pages        = {14679--14684},
  date         = {2010-08-17},
  issn         = {0027-8424, 1091-6490},
  doi          = {10.1073/pnas.1001665107},
  eprinttype   = {pmid},
  eprint       = {20668244},
  langid       = {english},
  keywords     = {comparative genomics,haptophytes,marine photosynthesis,primary production,prymnesiophytes},
  abstract     = {Among eukaryotes, four major phytoplankton lineages are responsible for marine photosynthesis; prymnesiophytes, alveolates, stramenopiles, and prasinophytes. Contributions by individual taxa, however, are not well known, and genomes have been analyzed from only the latter two lineages. Tiny “picoplanktonic” members of the prymnesiophyte lineage have long been inferred to be ecologically important but remain poorly characterized. Here, we examine pico-prymnesiophyte evolutionary history and ecology using cultivation-independent methods. 18S rRNA gene analysis showed pico-prymnesiophytes belonged to broadly distributed uncultivated taxa. Therefore, we used targeted metagenomics to analyze uncultured pico-prymnesiophytes sorted by flow cytometry from subtropical North Atlantic waters. The data reveal a composite nuclear-encoded gene repertoire with strong green-lineage affiliations, which contrasts with the evolutionary history indicated by the plastid genome. Measured pico-prymnesiophyte growth rates were rapid in this region, resulting in primary production contributions similar to the cyanobacterium Prochlorococcus. On average, pico-prymnesiophytes formed 25\% of global picophytoplankton biomass, with differing contributions in five biogeographical provinces spanning tropical to subpolar systems. Elements likely contributing to success include high gene density and genes potentially involved in defense and nutrient uptake. Our findings have implications reaching beyond pico-prymnesiophytes, to the prasinophytes and stramenopiles. For example, prevalence of putative Ni-containing superoxide dismutases (SODs), instead of Fe-containing SODs, seems to be a common adaptation among eukaryotic phytoplankton for reducing Fe quotas in low-Fe modern oceans. Moreover, highly mosaic gene repertoires, although compositionally distinct for each major eukaryotic lineage, now seem to be an underlying facet of successful marine phytoplankton.},
  urldate      = {2017-02-18},
  timestamp    = {2017-02-18T15:23:19Z}
}

@article{HugenholtzMicrobiology2008,
  author       = {Hugenholtz, Philip and Tyson, Gene W.},
  title        = {Microbiology: {{Metagenomics}}},
  shorttitle   = {Microbiology},
  journaltitle = {Nature},
  shortjournal = {Nature},
  volume       = {455},
  number       = {7212},
  pages        = {481--483},
  date         = {2008-09-25},
  issn         = {0028-0836},
  doi          = {10.1038/455481a},
  langid       = {english},
  abstract     = {Ten years after the term metagenomics was coined, the approach continues to gather momentum. This culture-independent, molecular way of analysing environmental samples of cohabiting microbial populations has opened up fresh perspectives on microbiology.},
  rights       = {© 2008 Nature Publishing Group},
  urldate      = {2017-02-18},
  timestamp    = {2017-02-18T15:24:03Z}
}

@article{WommackMetagenomics2008,
  title = {Metagenomics: {{Read Length Matters}}},
  volume = {74},
  issn = {0099-2240, 1098-5336},
  doi = {10.1128/AEM.02181-07},
  shorttitle = {Metagenomics},
  abstract = {Obtaining an unbiased view of the phylogenetic composition and functional diversity within a microbial community is one central objective of metagenomic analysis. New technologies, such as 454 pyrosequencing, have dramatically reduced sequencing costs, to a level where metagenomic analysis may become a viable alternative to more-focused assessments of the phylogenetic (e.g., 16S rRNA genes) and functional diversity of microbial communities. To determine whether the short (∼100 to 200 bp) sequence reads obtained from pyrosequencing are appropriate for the phylogenetic and functional characterization of microbial communities, the results of BLAST and COG analyses were compared for long (∼750 bp) and randomly derived short reads from each of two microbial and one virioplankton metagenome libraries. Overall, BLASTX searches against the GenBank nr database found far fewer homologs within the short-sequence libraries. This was especially pronounced for a Chesapeake Bay virioplankton metagenome library. Increasing the short-read sampling depth or the length of derived short reads (up to 400 bp) did not completely resolve the discrepancy in BLASTX homolog detection. Only in cases where the long-read sequence had a close homolog (low BLAST E-score) did the derived short-read sequence also find a significant homolog. Thus, more-distant homologs of microbial and viral genes are not detected by short-read sequences. Among COG hits, derived short reads sampled at a depth of two short reads per long read missed up to 72\% of the COG hits found using long reads. Noting the current limitation in computational approaches for the analysis of short sequences, the use of short-read-length libraries does not appear to be an appropriate tool for the metagenomic characterization of microbial communities.},
  timestamp = {2017-02-18T15:27:21Z},
  langid = {english},
  number = {5},
  journaltitle = {Applied and Environmental Microbiology},
  shortjournal = {Appl. Environ. Microbiol.},
  author = {Wommack, K. Eric and Bhavsar, Jaysheel and Ravel, Jacques},
  urldate = {2017-02-18},
  date = {2008-03-01},
  pages = {1453--1463},
  eprinttype = {pmid},
  eprint = {18192407}
}

@article{DohmSubstantial2008,
  author = {Dohm, Juliane C. and Lottaz, Claudio and Borodina, Tatiana and Himmelbauer, Heinz},
  title = {Substantial Biases in Ultra-Short Read Data Sets from High-Throughput {{DNA}} Sequencing},
  journaltitle = {Nucleic Acids Research},
  shortjournal = {Nucleic Acids Res},
  date = {2008-09},
  volume = {36},
  number = {16},
  pages = {e105},
  issn = {0305-1048},
  doi = {10.1093/nar/gkn425},
  eprinttype = {pmid},
  eprint = {18660515},
  pmcid = {PMC2532726},
  abstract = {Novel sequencing technologies permit the rapid production of large sequence data sets. These technologies are likely to revolutionize genetics and biomedical research, but a thorough characterization of the ultra-short read output is necessary. We generated and analyzed two Illumina 1G ultra-short read data sets, i.e. 2.8 million 27mer reads from a Beta vulgaris genomic clone and 12.3 million 36mers from the Helicobacter acinonychis genome. We found that error rates range from 0.3\% at the beginning of reads to 3.8\% at the end of reads. Wrong base calls are frequently preceded by base G. Base substitution error frequencies vary by 10- to 11-fold, with A $>$ C transversion being among the most frequent and C $>$ G transversions among the least frequent substitution errors. Insertions and deletions of single bases occur at very low rates. When simulating re-sequencing we found a 20-fold sequencing coverage to be sufficient to compensate errors by correct reads. The read coverage of the sequenced regions is biased; the highest read density was found in intervals with elevated GC content. High Solexa quality scores are over-optimistic and low scores underestimate the data quality. Our results show different types of biases and ways to detect them. Such biases have implications on the use and interpretation of Solexa data, for de novo sequencing, re-sequencing, the identification of single nucleotide polymorphisms and DNA methylation sites, as well as for transcriptome analysis.},
  urldate = {2017-02-18},
  timestamp = {2017-02-18T15:37:48Z}
}

@article{KuninWrinkles2010,
  author = {Kunin, Victor and Engelbrektson, Anna and Ochman, Howard and Hugenholtz, Philip},
  title = {Wrinkles in the Rare Biosphere: Pyrosequencing Errors Can Lead to Artificial Inflation of Diversity Estimates},
  shorttitle = {Wrinkles in the Rare Biosphere},
  journaltitle = {Environmental Microbiology},
  date = {2010-01-01},
  volume = {12},
  number = {1},
  pages = {118--123},
  issn = {1462-2920},
  doi = {10.1111/j.1462-2920.2009.02051.x},
  langid = {english},
  abstract = {Massively parallel pyrosequencing of the small subunit (16S) ribosomal RNA gene has revealed that the extent of rare microbial populations in several environments, the ‘rare biosphere’, is orders of magnitude higher than previously thought. One important caveat with this method is that sequencing error could artificially inflate diversity estimates. Although the per-base error of 16S rDNA amplicon pyrosequencing has been shown to be as good as or lower than Sanger sequencing, no direct assessments of pyrosequencing errors on diversity estimates have been reported. Using only Escherichia coli MG1655 as a reference template, we find that 16S rDNA diversity is grossly overestimated unless relatively stringent read quality filtering and low clustering thresholds are applied. In particular, the common practice of removing reads with unresolved bases and anomalous read lengths is insufficient to ensure accurate estimates of microbial diversity. Furthermore, common and reproducible homopolymer length errors can result in relatively abundant spurious phylotypes further confounding data interpretation. We suggest that stringent quality-based trimming of 16S pyrotags and clustering thresholds no greater than 97\% identity should be used to avoid overestimates of the rare biosphere.},
  urldate = {2017-02-18},
  timestamp = {2017-02-18T15:47:11Z}
}

@article{GhuryeMetagenomic2016,
  author = {Ghurye, Jay S. and Cepeda-Espinoza, Victoria and Pop, Mihai},
  title = {Metagenomic {{Assembly}}: {{Overview}}, {{Challenges}} and {{Applications}}},
  shorttitle = {Metagenomic {{Assembly}}},
  journaltitle = {The Yale Journal of Biology and Medicine},
  shortjournal = {Yale J Biol Med},
  date = {2016-09-30},
  volume = {89},
  number = {3},
  pages = {353--362},
  issn = {0044-0086},
  eprinttype = {pmid},
  eprint = {27698619},
  pmcid = {PMC5045144},
  abstract = {Advances in sequencing technologies have led to the increased use of high throughput sequencing in characterizing the microbial communities associated with our bodies and our environment. Critical to the analysis of the resulting data are sequence assembly algorithms able to reconstruct genes and organisms from complex mixtures. Metagenomic assembly involves new computational challenges due to the specific characteristics of the metagenomic data. In this survey, we focus on major algorithmic approaches for genome and metagenome assembly, and discuss the new challenges and opportunities afforded by this new field. We also review several applications of metagenome assembly in addressing interesting biological problems.},
  urldate = {2017-02-18},
  timestamp = {2017-02-18T15:50:56Z}
}

@article{HuangCap31999,
  title = {{{CAP3}}: {{A DNA Sequence Assembly Program}}},
  volume = {9},
  issn = {1088-9051, 1549-5469},
  doi = {10.1101/gr.9.9.868},
  shorttitle = {{{CAP3}}},
  abstract = {We describe the third generation of the CAP sequence assembly program. The CAP3 program includes a number of improvements and new features. The program has a capability to clip 5′ and 3′ low-quality regions of reads. It uses base quality values in computation of overlaps between reads, construction of multiple sequence alignments of reads, and generation of consensus sequences. The program also uses forward–reverse constraints to correct assembly errors and link contigs. Results of CAP3 on four BAC data sets are presented. The performance of CAP3 was compared with that of PHRAP on a number of BAC data sets. PHRAP often produces longer contigs than CAP3 whereas CAP3 often produces fewer errors in consensus sequences than PHRAP. It is easier to construct scaffolds with CAP3 than with PHRAP on low-pass data with forward–reverse constraints.},
  timestamp = {2017-02-18T15:58:34Z},
  langid = {english},
  number = {9},
  journaltitle = {Genome Research},
  shortjournal = {Genome Res.},
  author = {Huang, Xiaoqiu and Madan, Anup},
  urldate = {2017-02-18},
  date = {1999-09-01},
  pages = {868--877},
  eprinttype = {pmid},
  eprint = {10508846}
}

@article{SuttonTigr1995,
  author = {Sutton, Granger G. and White, Owen and Adams, Mark D. and Kerlavage, Anthony R.},
  title = {{{TIGR Assembler}}: {{A New Tool}} for {{Assembling Large Shotgun Sequencing Projects}}},
  shorttitle = {{{TIGR Assembler}}},
  journaltitle = {Genome Science and Technology},
  shortjournal = {Genome Science and Technology},
  date = {1995-01-01},
  volume = {1},
  number = {1},
  pages = {9--19},
  issn = {1070-2830},
  doi = {10.1089/gst.1995.1.9},
  abstract = {A new approach to assembling large, random shotgun sequencing projects has been developed. The TIGR Assembler overcomes several major obstacles to assembling such projects: the large number of pairwise comparisons required, the presence of repeat regions, chimeras introduced in the cloning process, and sequencing errors. A fast initial comparison of fragments based on oligonucleotide content is used to eliminate the need for a more sensitive comparison between most fragment pairs, thus greatly reducing computer search time. Potential repeat regions are recognized by determining which fragments have more potential overlaps than expected given a random distribution of fragments. Repeat regions are dealt with by increasing the match criteria stringency and by assembling these regions last so that maximum information from nonrepeat regions can be used. The algorithm also incorporates a number of constraints, such as clone length and the placement of sequences from the opposite ends of a clone. TIGR Assembler has been used to assemble the complete 1.8 Mbp Haemophilus influenzae (Fleischmann et al., 1995) and 0.58 Mbp Mycoplasma genitalium (Fraser et al., 1995) genomes.},
  urldate = {2017-02-18},
  timestamp = {2017-02-18T15:58:44Z}
}

@article{TurnbaughInvitation2008,
  author = {Turnbaugh, Peter J. and Gordon, Jeffrey I.},
  title = {An {{Invitation}} to the {{Marriage}} of {{Metagenomics}} and {{Metabolomics}}},
  journaltitle = {Cell},
  shortjournal = {Cell},
  date = {2008-09-05},
  volume = {134},
  number = {5},
  pages = {708--713},
  issn = {0092-8674},
  doi = {10.1016/j.cell.2008.08.025},
  abstract = {Metagenomics seeks to characterize the composition of microbial communities, their operations, and their dynamically coevolving relationships with the habitats they occupy without having to culture community members. Uniting metagenomics with analyses of the products of microbial community metabolism (metabolomics) will shed light on how microbial communities function in a variety of environments, including the human body.},
  urldate = {2017-02-18},
  timestamp = {2017-02-18T16:04:21Z}
}

@article{AguiarpulidoMetagenomics2016,
  author = {Aguiar-Pulido, Vanessa and Huang, Wenrui and Suarez-Ulloa, Victoria and Cickovski, Trevor and Mathee, Kalai and Narasimhan, Giri},
  title = {Metagenomics, {{Metatranscriptomics}}, and {{Metabolomics Approaches}} for {{Microbiome Analysis}}},
  journaltitle = {Evolutionary Bioinformatics Online},
  shortjournal = {Evol Bioinform Online},
  date = {2016-05-12},
  volume = {12},
  issue = {Suppl 1},
  pages = {5--16},
  issn = {1176-9343},
  doi = {10.4137/EBO.S36436},
  eprinttype = {pmid},
  eprint = {27199545},
  pmcid = {PMC4869604},
  abstract = {Microbiomes are ubiquitous and are found in the ocean, the soil, and in/on other living organisms. Changes in the microbiome can impact the health of the environmental niche in which they reside. In order to learn more about these communities, different approaches based on data from multiple omics have been pursued. Metagenomics produces a taxonomical profile of the sample, metatranscriptomics helps us to obtain a functional profile, and metabolomics completes the picture by determining which byproducts are being released into the environment. Although each approach provides valuable information separately, we show that, when combined, they paint a more comprehensive picture. We conclude with a review of network-based approaches as applied to integrative studies, which we believe holds the key to in-depth understanding of microbiomes.},
  urldate = {2017-02-18},
  timestamp = {2017-02-18T16:10:23Z}
}

@article{UfarteDiscovery2015,
  title = {Discovery of New Protein Families and Functions: New Challenges in Functional Metagenomics for Biotechnologies and Microbial Ecology},
  volume = {6},
  issn = {1664-302X},
  doi = {10.3389/fmicb.2015.00563},
  shorttitle = {Discovery of New Protein Families and Functions},
  abstract = {The rapid expansion of new sequencing technologies has enabled large-scale functional exploration of numerous microbial ecosystems, by establishing catalogues of functional genes and by comparing their prevalence in various microbiota. However, sequence similarity does not necessarily reflect functional conservation, since just a few modifications in a gene sequence can have a strong impact on the activity and the specificity of the corresponding enzyme or the recognition for a sensor. Similarly, some microorganisms harbor certain identified functions yet do not have the expected related genes in their genome. Finally, there are simply too many protein families whose function is not yet known, even though they are highly abundant in certain ecosystems. In this context, the discovery of new protein functions, using either sequence-based or activity-based approaches, is of crucial importance for the discovery of new enzymes and for improving the quality of annotation in public databases. This paper lists and explores the latest advances in this field, along with the challenges to be addressed, particularly where microfluidic technologies are concerned.},
  timestamp = {2017-02-18T19:09:05Z},
  langid = {english},
  journaltitle = {Frontiers in Microbiology},
  shortjournal = {Front. Microbiol.},
  author = {Ufarté, Lisa and Potocki-Veronese, Gabrielle and Laville, Élisabeth},
  urldate = {2017-02-18},
  date = {2015},
  pages = {563},
  keywords = {Biotechnologies,discovery of new functions,High throughput screening,Metagenomics,Microbial ecology,Microbial ecosystems,Proteins}
}

@article{UlyantsevMetafast2016,
  author = {Ulyantsev, Vladimir I. and Kazakov, Sergey V. and Dubinkina, Veronika B. and Tyakht, Alexander V. and Alexeev, Dmitry G.},
  title = {{{MetaFast}}: Fast Reference-Free Graph-Based Comparison of Shotgun Metagenomic Data},
  shorttitle = {{{MetaFast}}},
  journaltitle = {Bioinformatics},
  shortjournal = {Bioinformatics},
  date = {2016-09-15},
  volume = {32},
  number = {18},
  pages = {2760--2767},
  issn = {1367-4803},
  doi = {10.1093/bioinformatics/btw312},
  urldate = {2017-02-18},
  timestamp = {2017-02-18T19:51:13Z}
}

@article{VinhTwophase2015,
  author = {Vinh, Le Van and Lang, Tran Van and Binh, Le Thanh and Hoai, Tran Van},
  title = {A Two-Phase Binning Algorithm Using l-Mer Frequency on Groups of Non-Overlapping Reads},
  journaltitle = {Algorithms for Molecular Biology},
  shortjournal = {Algorithms for Molecular Biology},
  date = {2015},
  volume = {10},
  pages = {2},
  issn = {1748-7188},
  doi = {10.1186/s13015-014-0030-4},
  keywords = {Algorithm,Binning,l-mers frequency,Metagenomics,Next-generation sequencing},
  abstract = {Metagenomics is the study of genetic materials derived directly from complex microbial samples, instead of from culture. One of the crucial steps in metagenomic analysis, referred to as “binning”, is to separate reads into clusters that represent genomes from closely related organisms. Among the existing binning methods, unsupervised methods base the classification on features extracted from reads, and especially taking advantage in case of the limitation of reference database availability. However, their performance, under various aspects, is still being investigated by recent theoretical and empirical studies. The one addressed in this paper is among those efforts to enhance the accuracy of the classification.},
  urldate = {2017-02-18},
  timestamp = {2017-02-18T19:59:30Z}
}

@article{LaskenRecent2014,
  title = {Recent Advances in Genomic {{DNA}} Sequencing of Microbial Species from Single Cells},
  volume = {15},
  rights = {© 2014 Nature Publishing Group, a division of Macmillan Publishers Limited. All Rights Reserved.},
  issn = {1471-0056},
  doi = {10.1038/nrg3785},
  abstract = {The vast majority of microbial species remain uncultivated and, until recently, about half of all known bacterial phyla were identified only from their 16S ribosomal RNA gene sequence. With the advent of single-cell sequencing, genomes of uncultivated species are rapidly filling in unsequenced branches of the microbial phylogenetic tree. The wealth of new insights gained from these previously inaccessible groups is providing a deeper understanding of their basic biology, taxonomy and evolution, as well as their diverse roles in environmental ecosystems and human health.},
  timestamp = {2017-02-18T20:08:01Z},
  langid = {english},
  number = {9},
  journaltitle = {Nature Reviews Genetics},
  shortjournal = {Nat Rev Genet},
  author = {Lasken, Roger S. and McLean, Jeffrey S.},
  urldate = {2017-02-18},
  date = {2014-09},
  pages = {577--584},
  keywords = {Bacteria,Genome assembly algorithms,Metagenomics,Next-generation sequencing}
}

@article{MelstedKmerstream2014,
  title = {{{KmerStream}}: Streaming Algorithms for k-Mer Abundance Estimation},
  volume = {30},
  issn = {1367-4803, 1460-2059},
  doi = {10.1093/bioinformatics/btu713},
  shorttitle = {{{KmerStream}}},
  timestamp = {2017-02-20T16:41:08Z},
  langid = {english},
  number = {24},
  journaltitle = {Bioinformatics},
  author = {Melsted, Páll and Halldórsson, Bjarni V.},
  urldate = {2017-02-20},
  date = {2014-12-15},
  pages = {3541--3547}
}

@article{PellScaling2012a,
  author = {Pell, Jason and Hintze, Arend and Canino-Koning, Rosangela and Howe, Adina and Tiedje, James M. and Brown, C. Titus},
  title = {Scaling Metagenome Sequence Assembly with Probabilistic de {{Bruijn}} Graphs},
  journaltitle = {Proceedings of the National Academy of Sciences},
  date = {2012},
  volume = {109},
  number = {33},
  pages = {13272--13277},
  urldate = {2017-02-20},
  timestamp = {2017-02-20T16:49:29Z}
}

@article{MelstedEfficient2011,
  title = {Efficient Counting of {{k}}-Mers in {{DNA}} Sequences Using a Bloom Filter},
  volume = {12},
  issn = {1471-2105},
  doi = {10.1186/1471-2105-12-333},
  abstract = {Counting k-mers (substrings of length k in DNA sequence data) is an essential component of many methods in bioinformatics, including for genome and transcriptome assembly, for metagenomic sequencing, and for error correction of sequence reads. Although simple in principle, counting k-mers in large modern sequence data sets can easily overwhelm the memory capacity of standard computers. In current data sets, a large fraction-often more than 50\%-of the storage capacity may be spent on storing k-mers that contain sequencing errors and which are typically observed only a single time in the data. These singleton k-mers are uninformative for many algorithms without some kind of error correction.},
  timestamp = {2017-02-20T16:50:32Z},
  journaltitle = {BMC Bioinformatics},
  shortjournal = {BMC Bioinformatics},
  author = {Melsted, Páll and Pritchard, Jonathan K.},
  urldate = {2017-02-20},
  date = {2011},
  pages = {333}
}

@article{StrousBinning2012,
  title = {The {{Binning}} of {{Metagenomic Contigs}} for {{Microbial Physiology}} of {{Mixed Cultures}}},
  volume = {3},
  issn = {1664-302X},
  doi = {10.3389/fmicb.2012.00410},
  abstract = {So far, microbial physiology has dedicated itself mainly to pure cultures. In nature, cross feeding and competition are important aspects of microbial physiology and these can only be addressed by studying complete communities such as enrichment cultures. Metagenomic sequencing is a powerful tool to characterize such mixed cultures. In the analysis of metagenomic data, well established algorithms exist for the assembly of short reads into contigs and for the annotation of predicted genes. However, the binning of the assembled contigs or unassembled reads is still a major bottleneck and required to understand how the overall metabolism is partitioned over different community members. Binning consists of the clustering of contigs or reads that apparently originate from the same source population. In the present study eight metagenomic samples originating from the same habitat, a laboratory enrichment culture, were sequenced. Each sample contained 13-23 Mb of assembled contigs and up to eight abundant populations. Binning was attempted with existing methods but they were found to produce poor results, were slow, dependent on non-standard platforms or produced errors. A new binning procedure was developed based on multivariate statistics of tetranucleotide frequencies combined with the use of interpolated Markov models. Its performance was evaluated by comparison of the results between samples with BLAST and in comparison to exisiting algorithms for four publicly available metagenomes and one previously published artificial metagenome. The accuracy of the new approach was comparable or higher than existing methods. Further, it was up to a hunderd times faster. It was implemented in Java Swing as a complete open source graphical binning application available for download and further development (http://sourceforge.net/projects/metawatt).},
  timestamp = {2017-02-20T23:14:36Z},
  langid = {english},
  journaltitle = {Frontiers in Microbiology},
  shortjournal = {Front. Microbiol.},
  author = {Strous, Marc and Kraft, Beate and Bisdorf, Regina and Tegetmeyer, Halina},
  urldate = {2017-02-20},
  date = {2012},
  pages = {410},
  keywords = {Binning,enrichment culture,Interpolated Markov Models,Metagenomics,Tetranucleotide frequencies}
}

@article{SimonMetagenomic2011,
  author = {Simon, C. and Daniel, R.},
  title = {Metagenomic {{Analyses}}: {{Past}} and {{Future Trends}}},
  shorttitle = {Metagenomic {{Analyses}}},
  journaltitle = {Applied and Environmental Microbiology},
  date = {2011-02-15},
  volume = {77},
  number = {4},
  pages = {1153--1161},
  issn = {0099-2240},
  doi = {10.1128/AEM.02345-10},
  langid = {english},
  urldate = {2017-02-21},
  timestamp = {2017-02-21T14:31:43Z}
}

@article{ColeRibosomal2009,
  author = {Cole, J. R. and Wang, Q. and Cardenas, E. and Fish, J. and Chai, B. and Farris, R. J. and Kulam-Syed-Mohideen, A. S. and McGarrell, D. M. and Marsh, T. and Garrity, G. M. and Tiedje, J. M.},
  title = {The {{Ribosomal Database Project}}: Improved Alignments and New Tools for {{rRNA}} Analysis},
  shorttitle = {The {{Ribosomal Database Project}}},
  journaltitle = {Nucleic Acids Research},
  date = {2009-01-01},
  volume = {37},
  issue = {Database},
  pages = {D141--D145},
  issn = {0305-1048, 1362-4962},
  doi = {10.1093/nar/gkn879},
  langid = {english},
  urldate = {2017-02-21},
  timestamp = {2017-02-21T14:49:00Z}
}

@article{MarkowitzImg2012,
  author = {Markowitz, V. M. and Chen, I.-M. A. and Chu, K. and Szeto, E. and Palaniappan, K. and Grechkin, Y. and Ratner, A. and Jacob, B. and Pati, A. and Huntemann, M. and Liolios, K. and Pagani, I. and Anderson, I. and Mavromatis, K. and Ivanova, N. N. and Kyrpides, N. C.},
  title = {{{IMG}}/{{M}}: The Integrated Metagenome Data Management and Comparative Analysis System},
  shorttitle = {{{IMG}}/{{M}}},
  journaltitle = {Nucleic Acids Research},
  date = {2012-01-01},
  volume = {40},
  issue = {D1},
  pages = {D123--D129},
  issn = {0305-1048, 1362-4962},
  doi = {10.1093/nar/gkr975},
  langid = {english},
  urldate = {2017-02-21},
  timestamp = {2017-02-21T15:03:37Z}
}

@article{SunCommunity2011,
  author = {Sun, S. and Chen, J. and Li, W. and Altintas, I. and Lin, A. and Peltier, S. and Stocks, K. and Allen, E. E. and Ellisman, M. and Grethe, J. and Wooley, J.},
  title = {Community Cyberinfrastructure for {{Advanced Microbial Ecology Research}} and {{Analysis}}: The {{CAMERA}} Resource},
  shorttitle = {Community Cyberinfrastructure for {{Advanced Microbial Ecology Research}} and {{Analysis}}},
  journaltitle = {Nucleic Acids Research},
  date = {2011-01-01},
  volume = {39},
  issue = {Database},
  pages = {D546--D551},
  issn = {0305-1048, 1362-4962},
  doi = {10.1093/nar/gkq1102},
  langid = {english},
  urldate = {2017-02-21},
  timestamp = {2017-02-21T15:07:08Z}
}

@inproceedings{ChatterjiCompostbin2008,
  author = {Chatterji, Sourav and Yamazaki, Ichitaro and Bai, Zhaojun and Eisen, Jonathan A.},
  title = {{{CompostBin}}: {{A DNA}} Composition-Based Algorithm for Binning Environmental Shotgun Reads},
  shorttitle = {{{CompostBin}}},
  booktitle = {Annual {{International Conference}} on {{Research}} in {{Computational Molecular Biology}}},
  publisher = {{Springer}},
  date = {2008},
  pages = {17--28},
  urldate = {2017-02-21},
  timestamp = {2017-02-21T15:45:30Z}
}

@article{KimCentrifuge2016,
  title = {Centrifuge: Rapid and Sensitive Classification of Metagenomic Sequences},
  volume = {26},
  issn = {1088-9051, 1549-5469},
  doi = {10.1101/gr.210641.116},
  shorttitle = {Centrifuge},
  abstract = {Centrifuge is a novel microbial classification engine that enables rapid, accurate, and sensitive labeling of reads and quantification of species on desktop computers. The system uses an indexing scheme based on the Burrows-Wheeler transform (BWT) and the Ferragina-Manzini (FM) index, optimized specifically for the metagenomic classification problem. Centrifuge requires a relatively small index (4.2 GB for 4078 bacterial and 200 archaeal genomes) and classifies sequences at very high speed, allowing it to process the millions of reads from a typical high-throughput DNA sequencing run within a few minutes. Together, these advances enable timely and accurate analysis of large metagenomics data sets on conventional desktop computers. Because of its space-optimized indexing schemes, Centrifuge also makes it possible to index the entire NCBI nonredundant nucleotide sequence database (a total of 109 billion bases) with an index size of 69 GB, in contrast to k-mer-based indexing schemes, which require far more extensive space.},
  timestamp = {2017-03-02T09:37:55Z},
  langid = {english},
  number = {12},
  journaltitle = {Genome Research},
  shortjournal = {Genome Res.},
  author = {Kim, Daehwan and Song, Li and Breitwieser, Florian P. and Salzberg, Steven L.},
  urldate = {2017-03-02},
  date = {2016-12-01},
  pages = {1721--1729},
  eprinttype = {pmid},
  eprint = {27852649}
}

@article{SczyrbaCritical2017,
  title = {Critical {{Assessment}} of {{Metagenome Interpretation}} -- a Benchmark of Computational Metagenomics Software},
  rights = {© 2017, Posted by Cold Spring Harbor Laboratory. This pre-print is available under a Creative Commons License (Attribution 4.0 International), CC BY 4.0, as described at http://creativecommons.org/licenses/by/4.0/},
  doi = {10.1101/099127},
  abstract = {In metagenome analysis, computational methods for assembly, taxonomic profiling and binning are key components facilitating downstream biological data interpretation. However, a lack of consensus about benchmarking datasets and evaluation metrics complicates proper performance assessment. The Critical Assessment of Metagenome Interpretation (CAMI) challenge has engaged the global developer community to benchmark their programs on datasets of unprecedented complexity and realism. Benchmark metagenomes were generated from newly sequenced \textasciitilde{}700 microorganisms and \textasciitilde{}600 novel viruses and plasmids, including genomes with varying degrees of relatedness to each other and to publicly available ones and representing common experimental setups. Across all datasets, assembly and genome binning programs performed well for species represented by individual genomes, while performance was substantially affected by the presence of related strains. Taxonomic profiling and binning programs were proficient at high taxonomic ranks, with a notable performance decrease below the family level. Parameter settings substantially impacted performances, underscoring the importance of program reproducibility. While highlighting current challenges in computational metagenomics, the CAMI results provide a roadmap for software selection to answer specific research questions.},
  timestamp = {2017-03-10T11:25:55Z},
  langid = {english},
  journaltitle = {bioRxiv},
  author = {Sczyrba, Alexander and Hofmann, Peter and Belmann, Peter and Koslicki, David and Janssen, Stefan and Droege, Johannes and Gregor, Ivan and Majda, Stephan and Fiedler, Jessika and Dahms, Eik and Bremges, Andreas and Fritz, Adrian and Garrido-Oter, Ruben and Jorgensen, Tue Sparholt and Shapiro, Nicole and Blood, Philip D. and Gurevich, Alexey and Bai, Yang and Turaev, Dmitrij and DeMaere, Matthew Z. and Chikhi, Rayan and Nagarajan, Niranjan and Quince, Christopher and Hansen, Lars Hestbjerg and Sorensen, Soren J. and Chia, Burton K. H. and Denis, Bertrand and Froula, Jeff L. and Wang, Zhong and Egan, Robert and Kang, Dongwan Don and Cook, Jeffrey J. and Deltel, Charles and Beckstette, Michael and Lemaitre, Claire and Peterlongo, Pierre and Rizk, Guillaume and Lavenier, Dominique and Wu, Yu-Wei and Singer, Steven W. and Jain, Chirag and Strous, Marc and Klingenberg, Heiner and Meinicke, Peter and Barton, Michael and Lingner, Thomas and Lin, Hsin-Hung and Liao, Yu-Chieh and Silva, Genivaldo Gueiros Z. and Cuevas, Daniel A. and Edwards, Robert A. and Saha, Surya and Piro, Vitor C. and Renard, Bernhard Y. and Pop, Mihai and Klenk, Hans-Peter and Goeker, Markus and Kyrpides, Nikos and Woyke, Tanja and Vorholt, Julia A. and Schulze-Lefert, Paul and Rubin, Edward M. and Darling, Aaron E. and Rattei, Thomas and McHardy, Alice C.},
  urldate = {2017-03-10},
  date = {2017-01-09},
  pages = {099127}
}

@article{RondonCloning2000,
  title = {Cloning the Soil Metagenome: A Strategy for Accessing the Genetic and Functional Diversity of Uncultured Microorganisms},
  volume = {66},
  shorttitle = {Cloning the Soil Metagenome},
  timestamp = {2017-03-13T22:56:00Z},
  number = {6},
  journaltitle = {Applied and Environmental Microbiology},
  author = {Rondon, Michelle R. and August, Paul R. and Bettermann, Alan D. and Brady, Sean F. and Grossman, Trudy H. and Liles, Mark R. and Loiacono, Kara A. and Lynch, Berkley A. and MacNeil, Ian A. and Minor, Charles and others},
  urldate = {2017-03-13},
  date = {2000},
  pages = {2541--2547}
}

@article{BelmannBioboxes2015,
  author = {Belmann, Peter and Dröge, Johannes and Bremges, Andreas and McHardy, Alice C. and Sczyrba, Alexander and Barton, Michael D.},
  title = {Bioboxes: Standardised Containers for Interchangeable Bioinformatics Software},
  shorttitle = {Bioboxes},
  journaltitle = {GigaScience},
  date = {2015-12},
  volume = {4},
  number = {1},
  issn = {2047-217X},
  doi = {10.1186/s13742-015-0087-0},
  langid = {english},
  urldate = {2017-03-28},
  timestamp = {2017-03-28T07:38:36Z}
}

% `issn` holds exactly one ISSN so it passes datamodel validation; the second ISSN from the original export was 1740-1534 (presumably the electronic ISSN - confirm).
@article{PriceGenomescale2004,
  title = {Genome-Scale Models of Microbial Cells: Evaluating the Consequences of Constraints},
  volume = {2},
  issn = {1740-1526},
  doi = {10.1038/nrmicro1023},
  shorttitle = {Genome-Scale Models of Microbial Cells},
  timestamp = {2017-03-28T11:35:35Z},
  number = {11},
  journaltitle = {Nature Reviews Microbiology},
  author = {Price, Nathan D. and Reed, Jennifer L. and Palsson, Bernhard Ø.},
  urldate = {2017-03-28},
  date = {2004-11},
  pages = {886--897}
}

@article{YuMicrofluidicbased2017,
  author = {Yu, Feiqiao and Blainey, Paul C. and Schulz, Frederik and Woyke, Tanja and Horowitz, Mark A. and Quake, Stephen R.},
  title = {Microfluidic-Based Mini-Metagenomics Enables Discovery of Novel Microbial Lineages from Complex Environmental Samples},
  journaltitle = {bioRxiv},
  pages = {114496},
  date = {2017-03-07},
  doi = {10.1101/114496},
  urldate = {2017-03-28},
  langid = {english},
  abstract = {Metagenomics and single-cell genomics have enabled the discovery of many new genomes from previously unknown branches of life. However, extracting novel genomes from complex mixtures of metagenomic data can still be challenging and in many respects represents an ill-posed problem which is generally approached with ad hoc methods. Here we present a microfluidic-based mini-metagenomic method which offers a statistically rigorous approach to extract novel microbial genomes from complex samples. In addition, by generating 96 sub-samples from each environmental sample, this method maintains high throughput, reduces sample complexity, and preserves single-cell resolution. We used this approach to analyze two hot spring samples from Yellowstone National Park and extracted 29 new genomes larger than 0.5 Mbps. These genomes represent novel lineages at different taxonomic levels, including three deeply branching lineages. Functional analysis revealed that these organisms utilize diverse pathways for energy metabolism. The resolution of this mini-metagenomic method enabled accurate quantification of genome abundance, even for genomes less than 1\% in relative abundance. Our analyses also revealed a wide range of genome level single nucleotide polymorphism (SNP) distributions with nonsynonymous to synonymous ratio indicative of low to moderate environmental selection. The scale, resolution, and statistical power of microfluidic-based mini-metagenomic make it a powerful tool to dissect the genomic structure microbial communities while effectively preserving the fundamental unit of biology, the single cell.},
  rights = {© 2017, Posted by Cold Spring Harbor Laboratory. The copyright holder for this pre-print is the author. All rights reserved. The material may not be redistributed, re-used or adapted without the author's permission.},
  timestamp = {2017-03-28T12:02:28Z}
}

% The publisher's city is stored in `location`, the field biblatex reads; the book series is recorded once, in `series`.
@book{HastieElements2001,
  title = {The {{Elements}} of {{Statistical Learning}}},
  timestamp = {2017-03-28T12:36:53Z},
  series = {Springer Series in Statistics},
  publisher = {{Springer New York Inc.}},
  author = {Hastie, Trevor and Tibshirani, Robert and Friedman, Jerome},
  date = {2001},
  location = {New York, NY, USA}
}

% The themed-issue name is stored in `issuetitle`; for articles biblatex reserves `series` for a journal series designation.
@article{PonomarovaMetabolic2015,
  title = {Metabolic Interactions in Microbial Communities: Untangling the {{Gordian}} Knot},
  volume = {27},
  issn = {1369-5274},
  doi = {10.1016/j.mib.2015.06.014},
  shorttitle = {Metabolic Interactions in Microbial Communities},
  abstract = {Metabolic exchanges are ubiquitous in microbial communities. However, detecting metabolite cross-feedings is difficult due to their intrinsically dynamic nature and the complexity of communities. Thus, while exhaustive description of metabolic networks operating in natural systems is a task for the future, the battle of today is divided between detailed characterizations of small, reduced complexity microbial consortia, and focusing on particular metabolic aspects of natural ecosystems. Detecting metabolic interactions requires methodological blend able to capture species identity, dependencies and the nature of exchanged metabolites. Multiple combinations of diverse techniques, from metagenomics to imaging mass spectrometry, offer solutions to this challenge, each combination being tailored to the community at hand.},
  timestamp = {2017-03-29T12:33:16Z},
  journaltitle = {Current Opinion in Microbiology},
  shortjournal = {Current Opinion in Microbiology},
  issuetitle = {Antimicrobials • Microbial systems biology},
  author = {Ponomarova, Olga and Patil, Kiran Raosaheb},
  urldate = {2017-03-29},
  date = {2015-10},
  pages = {37--44}
}

@article{KuninBioinformatician2008,
  author = {Kunin, V. and Copeland, A. and Lapidus, A. and Mavromatis, K. and Hugenholtz, P.},
  title = {A {{Bioinformatician}}'s {{Guide}} to {{Metagenomics}}},
  journaltitle = {Microbiology and Molecular Biology Reviews},
  volume = {72},
  number = {4},
  pages = {557--578},
  date = {2008-12-01},
  issn = {1092-2172},
  doi = {10.1128/MMBR.00009-08},
  urldate = {2017-02-21},
  langid = {english},
  timestamp = {2017-02-21T14:34:01Z}
}

@article{DrogeProbabilistic2017,
  author = {Dröge, Johannes and Schönhuth, Alexander and McHardy, Alice C.},
  title = {A Probabilistic Model to Recover Individual Genomes from Metagenomes},
  journaltitle = {PeerJ Computer Science},
  shortjournal = {PeerJ Comput. Sci.},
  volume = {3},
  pages = {e117},
  date = {2017-05-22},
  issn = {2376-5992},
  doi = {10.7717/peerj-cs.117},
  urldate = {2017-10-26},
  langid = {english},
  abstract = {Shotgun metagenomics of microbial communities reveal information about strains of relevance for applications in medicine, biotechnology and ecology. Recovering their genomes is a crucial but very challenging step due to the complexity of the underlying biological system and technical factors. Microbial communities are heterogeneous, with oftentimes hundreds of present genomes deriving from different species or strains, all at varying abundances and with different degrees of similarity to each other and reference data. We present a versatile probabilistic model for genome recovery and analysis, which aggregates three types of information that are commonly used for genome recovery from metagenomes. As potential applications we showcase metagenome contig classification, genome sample enrichment and genome bin comparisons. The open source implementation MGLEX is available via the Python Package Index and on GitHub and can be embedded into metagenome analysis workflows and programs.},
  timestamp = {2017-10-26T09:22:53Z}
}

% Raw LaTeX injected ahead of the bibliography: if \DeclarePrefChars is defined, it is called with the
% characters ' ’ and - ; if it is not defined, the empty \else branch runs and nothing happens.
% NOTE(review): \DeclarePrefChars is presumably the biblatex name-prefix command - confirm against the biblatex manual.
@preamble{ "\ifdefined\DeclarePrefChars\DeclarePrefChars{'’-}\else\fi " }