-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbibliography.bib
2559 lines (2400 loc) · 250 KB
/
bibliography.bib
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
@article{GregorPhylopythias2016,
title = {{{PhyloPythiaS+}}: A Self-Training Method for the Rapid Reconstruction of Low-Ranking Taxonomic Bins from Metagenomes},
volume = {4},
issn = {2167-8359},
doi = {10.7717/peerj.1603},
abstract = {\textbf{Background.} Metagenomics is an approach for characterizing environmental microbial communities \emph{in situ}, it allows their functional and taxonomic characterization and to recover sequences from uncultured taxa. This is often achieved by a combination of sequence assembly and binning, where sequences are grouped into ‘bins’ representing taxa of the underlying microbial community. Assignment to low-ranking taxonomic bins is an important challenge for binning methods as is scalability to Gb-sized datasets generated with deep sequencing techniques. One of the best available methods for species bins recovery from deep-branching phyla is the expert-trained \emph{PhyloPythiaS} package, where a human expert decides on the taxa to incorporate in the model and identifies ‘training’ sequences based on marker genes directly from the sample. Due to the manual effort involved, this approach does not scale to multiple metagenome samples and requires substantial expertise, which researchers who are new to the area do not have. \textbf{Results.} We have developed \emph{PhyloPythiaS+}, a successor to our \emph{PhyloPythia(S)} software. The new (+) component performs the work previously done by the human expert. \emph{PhyloPythiaS+} also includes a new \emph{k}-mer counting algorithm, which accelerated the simultaneous counting of 4–6-mers used for taxonomic binning 100-fold and reduced the overall execution time of the software by a factor of three. Our software allows to analyze Gb-sized metagenomes with inexpensive hardware, and to recover species or genera-level bins with low error rates in a fully automated fashion. \emph{PhyloPythiaS+} was compared to \emph{MEGAN}, \emph{taxator-tk}, \emph{Kraken} and the generic \emph{PhyloPythiaS} model. The results showed that \emph{PhyloPythiaS+} performs especially well for samples originating from novel environments in comparison to the other methods.
\textbf{Availability.} \emph{PhyloPythiaS+} in a virtual machine is available for installation under Windows, Unix systems or OS X on: https://github.com/algbioi/ppsp/wiki.},
timestamp = {2016-06-17T09:22:44Z},
journaltitle = {PeerJ},
author = {Gregor, Ivan and Dröge, Johannes and Schirmer, Melanie and Quince, Christopher and McHardy, Alice C.},
date = {2016-02},
pages = {e1603},
keywords = {bioinformatics,machine learning,metagenomics,Taxonomic classification}
}
@article{LuCocacola2016,
title = {{{COCACOLA}}: Binning Metagenomic Contigs Using Sequence {{COmposition}}, Read {{CoverAge}}, {{CO}}-Alignment, and Paired-End Read {{LinkAge}}},
shorttitle = {{{COCACOLA}}},
doi = {10.1093/bioinformatics/btw290},
timestamp = {2016-06-17T10:00:21Z},
journaltitle = {Bioinformatics},
author = {Lu, Yang Young and Chen, Ting and Fuhrman, Jed A. and Sun, Fengzhu},
urldate = {2016-06-17},
date = {2016},
pages = {btw290}
}
@article{AlbertsenGenome2013,
title = {Genome Sequences of Rare, Uncultured Bacteria Obtained by Differential Coverage Binning of Multiple Metagenomes},
volume = {31},
issn = {1546-1696},
doi = {10.1038/nbt.2579},
abstract = {Reference genomes are required to understand the diverse roles of microorganisms in ecology, evolution, human and animal health, but most species remain uncultured. Here we present a sequence composition-independent approach to recover high-quality microbial genomes from deeply sequenced metagenomes. Multiple metagenomes of the same community, which differ in relative population abundances, were used to assemble 31 bacterial genomes, including rare (\textless{}1\% relative abundance) species, from an activated sludge bioreactor. Twelve genomes were assembled into complete or near-complete chromosomes. Four belong to the candidate bacterial phylum TM7 and represent the most complete genomes for this phylum to date (relative abundances, 0.06-1.58\%). Reanalysis of published metagenomes reveals that differential coverage binning facilitates recovery of more complete and higher fidelity genome bins than other currently used methods, which are primarily based on sequence composition. This approach will be an important addition to the standard metagenome toolbox and greatly improve access to genomes of uncultured microorganisms.},
timestamp = {2017-02-17T20:48:16Z},
number = {6},
journaltitle = {Nature Biotechnology},
author = {Albertsen, Mads and Hugenholtz, Philip and Skarshewski, Adam and Nielsen, Kåre L and Tyson, Gene W and Nielsen, Per H},
date = {2013-06},
pages = {533--538},
keywords = {Animals,Bacteria,Bacteria: classification,Bacteria: genetics,Bacterial,Base Sequence,DNA,Genome,Humans,Metagenome,Metagenomics,Molecular Sequence Data,Sequence Analysis},
eprinttype = {pmid},
eprint = {23707974}
}
@article{BuchfinkFast2014,
  author       = {Buchfink, Benjamin and Xie, Chao and Huson, Daniel H},
  title        = {Fast and Sensitive Protein Alignment Using {{DIAMOND}}},
  journaltitle = {Nature Methods},
  date         = {2014-11-17},
  volume       = {12},
  number       = {1},
  pages        = {59--60},
  issn         = {1548-7091, 1548-7105},
  doi          = {10.1038/nmeth.3176},
  urldate      = {2016-06-17},
  timestamp    = {2016-06-17T10:54:44Z}
}
@article{AmannPhylogenetic1995,
title = {Phylogenetic Identification and in Situ Detection of Individual Microbial Cells without Cultivation},
volume = {59},
issn = {0146-0749},
abstract = {The frequent discrepancy between direct microscopic counts and numbers of culturable bacteria from environmental samples is just one of several indications that we currently know only a minor part of the diversity of microorganisms in nature. A combination of direct retrieval of rRNA sequences and whole-cell oligonucleotide probing can be used to detect specific rRNA sequences of uncultured bacteria in natural samples and to microscopically identify individual cells. Studies have been performed with microbial assemblages of various complexities ranging from simple two-component bacterial endosymbiotic associations to multispecies enrichments containing magnetotactic bacteria to highly complex marine and soil communities. Phylogenetic analysis of the retrieved rRNA sequence of an uncultured microorganism reveals its closest culturable relatives and may, together with information on the physicochemical conditions of its natural habitat, facilitate more directed cultivation attempts. For the analysis of complex communities such as multispecies biofilms and activated-sludge flocs, a different approach has proven advantageous. Sets of probes specific to different taxonomic levels are applied consecutively beginning with the more general and ending with the more specific (a hierarchical top-to-bottom approach), thereby generating increasingly precise information on the structure of the community. Not only do rRNA-targeted whole-cell hybridizations yield data on cell morphology, specific cell counts, and in situ distributions of defined phylogenetic groups, but also the strength of the hybridization signal reflects the cellular rRNA content of individual cells. From the signal strength conferred by a specific probe, in situ growth rates and activities of individual cells might be estimated for known species.
In many ecosystems, low cellular rRNA content and/or limited cell permeability, combined with background fluorescence, hinders in situ identification of autochthonous populations. Approaches to circumvent these problems are discussed in detail.},
timestamp = {2016-06-16T16:07:15Z},
number = {1},
journaltitle = {Microbiological Reviews},
author = {Amann, R I and Ludwig, W and Schleifer, K H},
date = {1995-03},
pages = {143--169},
keywords = {16S,16S: analysis,16S: genetics,23S,23S: analysis,23S: genetics,Bacteria,Bacteria: genetics,Bacteria: isolation & purification,Bacterial,Bacterial: genetics,Bacterial: isolation & purification,Base Sequence,Genetic Variation,In Situ Hybridization,In Situ Hybridization: methods,Molecular Sequence Data,Ribosomal,RNA},
eprinttype = {pmid},
eprint = {7535888}
}
@article{BaranJoint2012,
title = {Joint Analysis of Multiple Metagenomic Samples},
volume = {8},
issn = {1553-7358},
doi = {10.1371/journal.pcbi.1002373},
abstract = {The availability of metagenomic sequencing data, generated by sequencing DNA pooled from multiple microbes living jointly, has increased sharply in the last few years with developments in sequencing technology. Characterizing the contents of metagenomic samples is a challenging task, which has been extensively attempted by both supervised and unsupervised techniques, each with its own limitations. Common to practically all the methods is the processing of single samples only; when multiple samples are sequenced, each is analyzed separately and the results are combined. In this paper we propose to perform a combined analysis of a set of samples in order to obtain a better characterization of each of the samples, and provide two applications of this principle. First, we use an unsupervised probabilistic mixture model to infer hidden components shared across metagenomic samples. We incorporate the model in a novel framework for studying association of microbial sequence elements with phenotypes, analogous to the genome-wide association studies performed on human genomes: We demonstrate that stratification may result in false discoveries of such associations, and that the components inferred by the model can be used to correct for this stratification. Second, we propose a novel read clustering (also termed "binning") algorithm which operates on multiple samples simultaneously, leveraging on the assumption that the different samples contain the same microbial species, possibly in different proportions. We show that integrating information across multiple samples yields more precise binning on each of the samples. Moreover, for both applications we demonstrate that given a fixed depth of coverage, the average per-sample performance generally increases with the number of sequenced samples as long as the per-sample coverage is high enough.},
timestamp = {2017-02-17T20:48:00Z},
number = {2},
journaltitle = {PLoS Computational Biology},
author = {Baran, Yael and Halperin, Eran},
date = {2012-02},
pages = {e1002373},
keywords = {Crohn's disease,DNA sequencing,Genome-wide association studies,Genomic databases,Genomic medicine,Metagenomics,Principal component analysis,Sequence alignment},
eprinttype = {pmid},
eprint = {22359490}
}
@article{BergerPerformance2011,
title = {Performance, Accuracy, and Web Server for Evolutionary Placement of Short Sequence Reads under Maximum Likelihood},
volume = {60},
issn = {1076-836X},
doi = {10.1093/sysbio/syr010},
abstract = {We present an evolutionary placement algorithm (EPA) and a Web server for the rapid assignment of sequence fragments (short reads) to edges of a given phylogenetic tree under the maximum-likelihood model. The accuracy of the algorithm is evaluated on several real-world data sets and compared with placement by pair-wise sequence comparison, using edit distances and BLAST. We introduce a slow and accurate as well as a fast and less accurate placement algorithm. For the slow algorithm, we develop additional heuristic techniques that yield almost the same run times as the fast version with only a small loss of accuracy. When those additional heuristics are employed, the run time of the more accurate algorithm is comparable with that of a simple BLAST search for data sets with a high number of short query sequences. Moreover, the accuracy of the EPA is significantly higher, in particular when the sample of taxa in the reference topology is sparse or inadequate. Our algorithm, which has been integrated into RAxML, therefore provides an equally fast but more accurate alternative to BLAST for tree-based inference of the evolutionary origin and composition of short sequence reads. We are also actively developing a Web server that offers a freely available service for computing read placements on trees using the EPA.},
timestamp = {2016-06-16T16:07:17Z},
number = {3},
journaltitle = {Systematic Biology},
author = {Berger, Simon A and Krompass, Denis and Stamatakis, Alexandros},
date = {2011-05},
pages = {291--302},
keywords = {Algorithms,Amino Acid Sequence,Base Sequence,Computer Simulation,DNA,DNA: methods,Evolution,Internet,Likelihood Functions,Molecular,phylogenetic placement algorithm,Phylogeny,Protein,Protein: methods,RNA,RNA: methods,Sequence Alignment,Sequence Alignment: methods,Sequence Analysis,Software,taxonomic binning},
eprinttype = {pmid},
eprint = {21436105}
}
@article{BradyPhymmbl2011,
title = {{{PhymmBL}} Expanded: Confidence Scores, Custom Databases, Parallelization and More},
volume = {8},
issn = {1548-7105},
doi = {10.1038/nmeth0511-367},
timestamp = {2016-06-16T16:07:18Z},
number = {5},
journaltitle = {Nature Methods},
author = {Brady, Arthur and Salzberg, Steven},
date = {2011-05},
pages = {367},
keywords = {Animals,Classification,Classification: methods,Databases,Elephants,Elephants: classification,Elephants: genetics,Mammoths,Mammoths: classification,Mammoths: genetics,metagenomics,Metagenomics: statistics & numerical data,Nucleic Acid,Phylogeny,Sequence Alignment,Sequence Alignment: statistics & numerical data,Software},
eprinttype = {pmid},
eprint = {21527926}
}
@article{BradyPhymm2009,
title = {Phymm and {{PhymmBL}}: Metagenomic Phylogenetic Classification with Interpolated {{Markov}} Models},
volume = {6},
issn = {1548-7105},
doi = {10.1038/nmeth.1358},
abstract = {Metagenomics projects collect DNA from uncharacterized environments that may contain thousands of species per sample. One main challenge facing metagenomic analysis is phylogenetic classification of raw sequence reads into groups representing the same or similar taxa, a prerequisite for genome assembly and for analyzing the biological diversity of a sample. New sequencing technologies have made metagenomics easier, by making sequencing faster, and more difficult, by producing shorter reads than previous technologies. Classifying sequences from reads as short as 100 base pairs has until now been relatively inaccurate, requiring researchers to use older, long-read technologies. We present Phymm, a classifier for metagenomic data, that has been trained on 539 complete, curated genomes and can accurately classify reads as short as 100 base pairs, a substantial improvement over previous composition-based classification methods. We also describe how combining Phymm with sequence alignment algorithms improves accuracy.},
timestamp = {2016-06-16T16:07:18Z},
number = {9},
journaltitle = {Nature Methods},
author = {Brady, Arthur and Salzberg, Steven L},
date = {2009-09},
pages = {673--676},
keywords = {Artificial Intelligence,Bacteria,Bacteria: classification,Bacteria: genetics,Base Sequence,DNA,DNA: classification,DNA: genetics,Genetic,Genomics,Genomics: methods,Hydrogen-Ion Concentration,Markov Chains,Mining,Models,Phylogeny,Sequence Alignment,Soil Microbiology},
eprinttype = {pmid},
eprint = {19648916}
}
@article{CamachoBlast2009,
title = {{{BLAST}}+: Architecture and Applications},
volume = {10},
issn = {1471-2105},
doi = {10.1186/1471-2105-10-421},
abstract = {BACKGROUND: Sequence similarity searching is a very important bioinformatics task. While Basic Local Alignment Search Tool (BLAST) outperforms exact methods through its use of heuristics, the speed of the current BLAST software is suboptimal for very long queries or database sequences. There are also some shortcomings in the user-interface of the current command-line applications. RESULTS: We describe features and improvements of rewritten BLAST software and introduce new command-line applications. Long query sequences are broken into chunks for processing, in some cases leading to dramatically shorter run times. For long database sequences, it is possible to retrieve only the relevant parts of the sequence, reducing CPU time and memory usage for searches of short queries against databases of contigs or chromosomes. The program can now retrieve masking information for database sequences from the BLAST databases. A new modular software library can now access subject sequence data from arbitrary data sources. We introduce several new features, including strategy files that allow a user to save and reuse their favorite set of options. The strategy files can be uploaded to and downloaded from the NCBI BLAST web site. CONCLUSION: The new BLAST command-line applications, compared to the current BLAST tools, demonstrate substantial speed improvements for long queries as well as chromosome length database sequences. We have also improved the user interface of the command-line applications.},
timestamp = {2016-06-16T16:07:19Z},
journaltitle = {BMC Bioinformatics},
author = {Camacho, Christiam and Coulouris, George and Avagyan, Vahram and Ma, Ning and Papadopoulos, Jason and Bealer, Kevin and Madden, Thomas L},
date = {2009-01},
pages = {421},
keywords = {Computational Biology,Computational Biology: methods,Databases,Genetic,Sequence Alignment,Software},
eprinttype = {pmid},
eprint = {20003500}
}
@article{CarrReconstructing2013,
title = {Reconstructing the Genomic Content of Microbiome Taxa through Shotgun Metagenomic Deconvolution},
volume = {9},
issn = {1553-7358},
doi = {10.1371/journal.pcbi.1003292},
abstract = {Metagenomics has transformed our understanding of the microbial world, allowing researchers to bypass the need to isolate and culture individual taxa and to directly characterize both the taxonomic and gene compositions of environmental samples. However, associating the genes found in a metagenomic sample with the specific taxa of origin remains a critical challenge. Existing binning methods, based on nucleotide composition or alignment to reference genomes allow only a coarse-grained classification and rely heavily on the availability of sequenced genomes from closely related taxa. Here, we introduce a novel computational framework, integrating variation in gene abundances across multiple samples with taxonomic abundance data to deconvolve metagenomic samples into taxa-specific gene profiles and to reconstruct the genomic content of community members. This assembly-free method is not bounded by various factors limiting previously described methods of metagenomic binning or metagenomic assembly and represents a fundamentally different approach to metagenomic-based genome reconstruction. An implementation of this framework is available at http://elbo.gs.washington.edu/software.html. We first describe the mathematical foundations of our framework and discuss considerations for implementing its various components. We demonstrate the ability of this framework to accurately deconvolve a set of metagenomic samples and to recover the gene content of individual taxa using synthetic metagenomic samples. We specifically characterize determinants of prediction accuracy and examine the impact of annotation errors on the reconstructed genomes. We finally apply metagenomic deconvolution to samples from the Human Microbiome Project, successfully reconstructing genus-level genomic content of various microbial genera, based solely on variation in gene count. These reconstructed genera are shown to correctly capture genus-specific properties.
With the accumulation of metagenomic data, this deconvolution framework provides an essential tool for characterizing microbial taxa never before seen, laying the foundation for addressing fundamental questions concerning the taxa comprising diverse microbial communities.},
timestamp = {2016-06-16T16:07:19Z},
number = {10},
journaltitle = {PLoS Computational Biology},
author = {Carr, Rogan and Shen-Orr, Shai S and Borenstein, Elhanan},
date = {2013-01},
pages = {e1003292},
eprinttype = {pmid},
eprint = {24146609}
}
@article{DarlingPhylosift2014,
title = {{{PhyloSift}}: Phylogenetic Analysis of Genomes and Metagenomes},
volume = {2},
issn = {2167-8359},
doi = {10.7717/peerj.243},
abstract = {Like all organisms on the planet, environmental microbes are subject to the forces of molecular evolution. Metagenomic sequencing provides a means to access the DNA sequence of uncultured microbes. By combining DNA sequencing of microbial communities with evolutionary modeling and phylogenetic analysis we might obtain new insights into microbiology and also provide a basis for practical tools such as forensic pathogen detection. In this work we present an approach to leverage phylogenetic analysis of metagenomic sequence data to conduct several types of analysis. First, we present a method to conduct phylogeny-driven Bayesian hypothesis tests for the presence of an organism in a sample. Second, we present a means to compare community structure across a collection of many samples and develop direct associations between the abundance of certain organisms and sample metadata. Third, we apply new tools to analyze the phylogenetic diversity of microbial communities and again demonstrate how this can be associated to sample metadata. These analyses are implemented in an open source software pipeline called PhyloSift. As a pipeline, PhyloSift incorporates several other programs including LAST, HMMER, and pplacer to automate phylogenetic analysis of protein coding and RNA sequences in metagenomic datasets generated by modern sequencing platforms (e.g., Illumina, 454).},
timestamp = {2016-06-16T16:07:22Z},
journaltitle = {PeerJ},
author = {Darling, Aaron E. and Jospin, Guillaume and Lowe, Eric and Matsen, Frederick A. and Bik, Holly M. and Eisen, Jonathan A.},
date = {2014-01},
pages = {e243},
keywords = {bayes factor,community,forensics,metagenomics,microbial diversity,phylogenetics},
eprinttype = {pmid},
eprint = {24482762}
}
@article{DesantisGreengenes2006,
title = {Greengenes, a Chimera-Checked {{16S rRNA}} Gene Database and Workbench Compatible with {{ARB}}},
volume = {72},
issn = {0099-2240},
doi = {10.1128/AEM.03006-05},
abstract = {A 16S rRNA gene database (http://greengenes.lbl.gov) addresses limitations of public repositories by providing chimera screening, standard alignment, and taxonomic classification using multiple published taxonomies. It was found that there is incongruent taxonomic nomenclature among curators even at the phylum level. Putative chimeras were identified in 3\% of environmental sequences and in 0.2\% of records derived from isolates. Environmental sequences were classified into 100 phylum-level lineages in the Archaea and Bacteria.},
timestamp = {2016-06-16T16:07:23Z},
number = {7},
journaltitle = {Applied and Environmental Microbiology},
author = {DeSantis, T Z and Hugenholtz, P and Larsen, N and Rojas, M and Brodie, E L and Keller, K and Huber, T and Dalevi, D and Hu, P and Andersen, G L},
date = {2006-07},
pages = {5069--5072},
keywords = {16S,16S: genetics,Archaea,Archaea: classification,Archaea: genetics,Bacteria,Bacteria: classification,Bacteria: genetics,Databases,Genes,Genetic,Nucleic Acid,Nucleic Acid: standards,Polymerase Chain Reaction,Recombination,Reproducibility of Results,Ribosomal,RNA,rRNA,Sequence Alignment,Software},
eprinttype = {pmid},
eprint = {16820507}
}
@article{DeschavanneGenomic1999,
title = {Genomic Signature: Characterization and Classification of Species Assessed by Chaos Game Representation of Sequences},
volume = {16},
issn = {0737-4038},
abstract = {We explored DNA structures of genomes by means of a new tool derived from the "chaotic dynamical systems" theory (the so-called chaos game representation [CGR]), which allows the depiction of frequencies of oligonucleotides in the form of images. Using CGR, we observe that subsequences of a genome exhibit the main characteristics of the whole genome, attesting to the validity of the genomic signature concept. Base concentrations, stretches (runs of complementary bases or purines/pyrimidines), and patches (over- or underexpressed words of various lengths) are the main factors explaining the variability observed among sequences. The distance between images may be considered a measure of phylogenetic proximity. Eukaryotes and prokaryotes can be identified merely on the basis of their DNA structures.},
timestamp = {2016-06-16T16:07:23Z},
number = {10},
journaltitle = {Molecular Biology and Evolution},
author = {Deschavanne, P J and Giron, A and Vilain, J and Fagot, G and Fertil, B},
date = {1999-10},
pages = {1391--1399},
keywords = {Algorithms,Animals,Classification,Computer-Assisted,Computer Simulation,DNA,DNA: analysis,DNA: genetics,Evolution,Genome,Humans,Image Processing,Molecular,Phylogeny,Species Specificity},
eprinttype = {pmid},
eprint = {10563018}
}
@article{DiazTacoa2009,
title = {{{TACOA}}: Taxonomic Classification of Environmental Genomic Fragments Using a Kernelized Nearest Neighbor Approach},
volume = {10},
issn = {1471-2105},
doi = {10.1186/1471-2105-10-56},
abstract = {BACKGROUND: Metagenomics, or the sequencing and analysis of collective genomes (metagenomes) of microorganisms isolated from an environment, promises direct access to the "unculturable majority". This emerging field offers the potential to lay solid basis on our understanding of the entire living world. However, the taxonomic classification is an essential task in the analysis of metagenomics data sets that it is still far from being solved. We present a novel strategy to predict the taxonomic origin of environmental genomic fragments. The proposed classifier combines the idea of the k-nearest neighbor with strategies from kernel-based learning. RESULTS: Our novel strategy was extensively evaluated using the leave-one-out cross validation strategy on fragments of variable length (800 bp - 50 Kbp) from 373 completely sequenced genomes. TACOA is able to classify genomic fragments of length 800 bp and 1 Kbp with high accuracy until rank class. For longer fragments \textgreater{} or = 3 Kbp accurate predictions are made at even deeper taxonomic ranks (order and genus). Remarkably, TACOA also produces reliable results when the taxonomic origin of a fragment is not represented in the reference set, thus classifying such fragments to its known broader taxonomic class or simply as "unknown". We compared the classification accuracy of TACOA with the latest intrinsic classifier PhyloPythia using 63 recently published complete genomes. For fragments of length 800 bp and 1 Kbp the overall accuracy of TACOA is higher than that obtained by PhyloPythia at all taxonomic ranks. For all fragment lengths, both methods achieved comparable high specificity results up to rank class and low false negative rates are also obtained. CONCLUSION: An accurate multi-class taxonomic classifier was developed for environmental genomic fragments. TACOA can predict with high reliability the taxonomic origin of genomic fragments as short as 800 bp.
The proposed method is transparent, fast, accurate and the reference set can be easily updated as newly sequenced genomes become available. Moreover, the method demonstrated to be competitive when compared to the most current classifier PhyloPythia and has the advantage that it can be locally installed and the reference set can be kept up-to-date.},
timestamp = {2016-06-16T16:07:24Z},
journaltitle = {BMC Bioinformatics},
author = {Diaz, Naryttza N and Krause, Lutz and Goesmann, Alexander and Niehaus, Karsten and Nattkemper, Tim W},
date = {2009-01},
pages = {56},
keywords = {Algorithms,Archaea,Archaea: classification,Archaea: genetics,Bacteria,Bacteria: classification,Bacteria: genetics,Classification,Classification: methods,Cluster Analysis,Environmental Microbiology,Genome,Genomics,Genomics: methods,metagenomics,Software,Software Validation,taxonomic binning},
eprinttype = {pmid},
eprint = {19210774}
}
@article{DrogeTaxatortk2014,
title = {{{Taxator-tk}}: Precise Taxonomic Assignment of Metagenomes by Fast Approximation of Evolutionary Neighborhoods},
volume = {31},
issn = {1367-4811},
doi = {10.1093/bioinformatics/btu745},
abstract = {MOTIVATION: Metagenomics characterizes microbial communities by random shotgun sequencing of DNA isolated directly from an environment of interest. An essential step in computational metagenome analysis is taxonomic sequence assignment, which allows identifying the sequenced community members and reconstructing taxonomic bins with sequence data for the individual taxa. For the massive datasets generated by next-generation sequencing technologies, this cannot be performed with de-novo phylogenetic inference methods. We describe an algorithm and the accompanying software, taxator-tk, which performs taxonomic sequence assignment by fast approximate determination of evolutionary neighbors from sequence similarities. RESULTS: Taxator-tk was precise in its taxonomic assignment across all ranks and taxa for a range of evolutionary distances and for short as well as for long sequences. In addition to the taxonomic binning of metagenomes, it is well suited for profiling microbial communities from metagenome samples because it identifies bacterial, archaeal and eukaryotic community members without being affected by varying primer binding strengths, as in marker gene amplification, or copy number variations of marker genes across different taxa. Taxator-tk has an efficient, parallelized implementation that allows the assignment of 6 Gb of sequence data per day on a standard multiprocessor system with 10 CPU cores and microbial RefSeq as the genomic reference data. Availability and implementation: Taxator-tk source and binary program files are publicly available at http://algbio.cs.uni-duesseldorf.de/software/. CONTACT: [email protected] Supplementary information: Supplementary data are available at Bioinformatics online.},
timestamp = {2016-07-06T12:35:17Z},
eprinttype = {pubmed},
eprint = {25388150},
number = {6},
journaltitle = {Bioinformatics},
author = {Dröge, Johannes and Gregor, Ivan and McHardy, Alice C.},
date = {2015-03},
pages = {817--824}
}
@article{FinnHmmer2011,
title = {{{HMMER}} Web Server: Interactive Sequence Similarity Searching},
volume = {39},
issn = {1362-4962},
doi = {10.1093/nar/gkr367},
abstract = {HMMER is a software suite for protein sequence similarity searches using probabilistic methods. Previously, HMMER has mainly been available only as a computationally intensive UNIX command-line tool, restricting its use. Recent advances in the software, HMMER3, have resulted in a 100-fold speed gain relative to previous versions. It is now feasible to make efficient profile hidden Markov model (profile HMM) searches via the web. A HMMER web server (http://hmmer.janelia.org) has been designed and implemented such that most protein database searches return within a few seconds. Methods are available for searching either a single protein sequence, multiple protein sequence alignment or profile HMM against a target sequence database, and for searching a protein sequence against Pfam. The web server is designed to cater to a range of different user expertise and accepts batch uploading of multiple queries at once. All search methods are also available as RESTful web services, thereby allowing them to be readily integrated as remotely executed tasks in locally scripted workflows. We have focused on minimizing search times and the ability to rapidly display tabular results, regardless of the number of matches found, developing graphical summaries of the search results to provide quick, intuitive appraisement of them.},
timestamp = {2016-06-16T16:07:29Z},
number = {Suppl 2},
journaltitle = {Nucleic Acids Research},
author = {Finn, Robert D and Clements, Jody and Eddy, Sean R},
date = {2011-07},
pages = {W29--W37},
eprinttype = {pmid},
eprint = {21593126}
}
@article{FrithParameters2010,
title = {Parameters for Accurate Genome Alignment},
volume = {11},
issn = {1471-2105},
doi = {10.1186/1471-2105-11-80},
abstract = {BACKGROUND: Genome sequence alignments form the basis of much research. Genome alignment depends on various mundane but critical choices, such as how to mask repeats and which score parameters to use. Surprisingly, there has been no large-scale assessment of these choices using real genomic data. Moreover, rigorous procedures to control the rate of spurious alignment have not been employed. RESULTS: We have assessed 495 combinations of score parameters for alignment of animal, plant, and fungal genomes. As our gold-standard of accuracy, we used genome alignments implied by multiple alignments of proteins and of structural RNAs. We found the HOXD scoring schemes underlying alignments in the UCSC genome database to be far from optimal, and suggest better parameters. Higher values of the X-drop parameter are not always better. E-values accurately indicate the rate of spurious alignment, but only if tandem repeats are masked in a non-standard way. Finally, we show that gamma-centroid (probabilistic) alignment can find highly reliable subsets of aligned bases. CONCLUSIONS: These results enable more accurate genome alignment, with reliability measures for local alignments and for individual aligned bases. This study was made possible by our new software, LAST, which can align vertebrate genomes in a few hours http://last.cbrc.jp/.},
timestamp = {2016-06-16T16:07:30Z},
journaltitle = {BMC Bioinformatics},
author = {Frith, Martin C and Hamada, Michiaki and Horton, Paul},
date = {2010-02},
pages = {80},
keywords = {Base Sequence,Computational Biology,Computational Biology: methods,Genome,Molecular Sequence Data,Proteins,Proteins: chemistry,RNA,RNA: chemistry,Sequence Alignment,Sequence Alignment: methods,Sequence Analysis},
eprinttype = {pmid},
eprint = {20144198}
}
@article{FuhrmanMicrobial2009,
title = {Microbial Community Structure and Its Functional Implications},
volume = {459},
issn = {1476-4687},
doi = {10.1038/nature08058},
abstract = {Marine microbial communities are engines of globally important processes, such as the marine carbon, nitrogen and sulphur cycles. Recent data on the structures of these communities show that they adhere to universal biological rules. Co-occurrence patterns can help define species identities, and systems-biology tools are revealing networks of interacting microorganisms. Some microbial systems are found to change predictably, helping us to anticipate how microbial communities and their activities will shift in a changing world.},
timestamp = {2016-06-16T16:07:31Z},
number = {7244},
journaltitle = {Nature},
author = {Fuhrman, Jed A.},
date = {2009-05},
pages = {193--199},
keywords = {Animals,Ecosystem,Genomics,Genomics: methods,Genomics: trends,Greenhouse Effect,Marine Biology,Water Microbiology},
eprinttype = {pmid},
eprint = {19444205}
}
@article{GerlachTaxonomic2011,
title = {Taxonomic Classification of Metagenomic Shotgun Sequences with {{CARMA3}}},
volume = {39},
issn = {1362-4962},
doi = {10.1093/nar/gkr225},
abstract = {The vast majority of microbes are unculturable and thus cannot be sequenced by means of traditional methods. High-throughput sequencing techniques like 454 or Solexa-Illumina make it possible to explore those microbes by studying whole natural microbial communities and analysing their biological diversity as well as the underlying metabolic pathways. Over the past few years, different methods have been developed for the taxonomic and functional characterization of metagenomic shotgun sequences. However, the taxonomic classification of metagenomic sequences from novel species without close homologue in the biological sequence databases poses a challenge due to the high number of wrong taxonomic predictions on lower taxonomic ranks. Here we present CARMA3, a new method for the taxonomic classification of assembled and unassembled metagenomic sequences that has been adapted to work with both BLAST and HMMER3 homology searches. We show that our method makes fewer wrong taxonomic predictions (at the same sensitivity) than other BLAST-based methods. CARMA3 is freely accessible via the web application WebCARMA from http://webcarma.cebitec.uni-bielefeld.de.},
timestamp = {2016-06-16T16:07:33Z},
number = {14},
journaltitle = {Nucleic Acids Research},
author = {Gerlach, Wolfgang and Stoye, Jens},
date = {2011-08},
pages = {e91},
eprinttype = {pmid},
eprint = {21586583}
}
@online{GregorPhylopythias2014,
title = {{{PhyloPythiaS}}+: {{A}} Self-Training Method for the Rapid Reconstruction of Low-Ranking Taxonomic Bins from Metagenomes},
abstract = {Metagenomics is an approach for characterizing environmental microbial communities in situ, it allows their functional and taxonomic characterization and to recover sequences from uncultured taxa. For communities of up to medium diversity, e.g. excluding environments such as soil, this is often achieved by a combination of sequence assembly and binning, where sequences are grouped into 'bins' representing taxa of the underlying microbial community from which they originate. Assignment to low-ranking taxonomic bins is an important challenge for binning methods as is scalability to Gb-sized datasets generated with deep sequencing techniques. One of the best available methods for the recovery of species bins from an individual metagenome sample is the expert-trained PhyloPythiaS package, where a human expert decides on the taxa to incorporate in a composition-based taxonomic metagenome classifier and identifies the 'training' sequences using marker genes directly from the sample. Due to the manual effort involved, this approach does not scale to multiple metagenome samples and requires substantial expertise, which researchers who are new to the area may not have. With these challenges in mind, we have developed PhyloPythiaS+, a successor to our previously described method PhyloPythia(S). The newly developed + component performs the work previously done by the human expert. PhyloPythiaS+ also includes a new k-mer counting algorithm, which accelerated k-mer counting 100-fold and reduced the overall execution time of the software by a factor of three. Our software allows to analyze Gb-sized metagenomes with inexpensive hardware, and to recover species or genera-level bins with low error rates in a fully automated fashion.},
timestamp = {2016-06-16T16:07:38Z},
author = {Gregor, Ivan and Dröge, Johannes and Schirmer, Melanie and Quince, Christopher and McHardy, Alice C.},
date = {2014-06},
pagetotal = {67},
eprinttype = {arXiv},
eprint = {1406.0660}
}
@article{HamadyMicrobial2009,
title = {Microbial Community Profiling for Human Microbiome Projects: {{Tools}}, Techniques, and Challenges},
volume = {19},
issn = {1088-9051},
doi = {10.1101/gr.085464.108},
abstract = {High-throughput sequencing studies and new software tools are revolutionizing microbial community analyses, yet the variety of experimental and computational methods can be daunting. In this review, we discuss some of the different approaches to community profiling, highlighting strengths and weaknesses of various experimental approaches, sequencing methodologies, and analytical methods. We also address one key question emerging from various Human Microbiome Projects: Is there a substantial core of abundant organisms or lineages that we all share? It appears that in some human body habitats, such as the hand and the gut, the diversity among individuals is so great that we can rule out the possibility that any species is at high abundance in all individuals: It is possible that the focus should instead be on higher-level taxa or on functional genes instead.},
timestamp = {2016-06-16T16:07:38Z},
number = {7},
journaltitle = {Genome Research},
author = {Hamady, Micah and Knight, Rob},
date = {2009-07},
pages = {1141--1152},
keywords = {16S,16S: classification,16S: genetics,Gene Expression Profiling,Humans,Metagenome,Ribosomal,RNA},
eprinttype = {pmid},
eprint = {19383763}
}
@article{HandelsmanMetagenomics2004,
title = {Metagenomics: Application of Genomics to Uncultured Microorganisms},
volume = {68},
issn = {1092-2172},
doi = {10.1128/MMBR.68.4.669-685.2004},
abstract = {Metagenomics (also referred to as environmental and community genomics) is the genomic analysis of microorganisms by direct extraction and cloning of DNA from an assemblage of microorganisms. The development of metagenomics stemmed from the ineluctable evidence that as-yet-uncultured microorganisms represent the vast majority of organisms in most environments on earth. This evidence was derived from analyses of 16S rRNA gene sequences amplified directly from the environment, an approach that avoided the bias imposed by culturing and led to the discovery of vast new lineages of microbial life. Although the portrait of the microbial world was revolutionized by analysis of 16S rRNA genes, such studies yielded only a phylogenetic description of community membership, providing little insight into the genetics, physiology, and biochemistry of the members. Metagenomics provides a second tier of technical innovation that facilitates study of the physiology and ecology of environmental microorganisms. Novel genes and gene products discovered through metagenomics include the first bacteriorhodopsin of bacterial origin; novel small molecules with antimicrobial activity; and new members of families of known proteins, such as an Na(+)(Li(+))/H(+) antiporter, RecA, DNA polymerase, and antibiotic resistance determinants. Reassembly of multiple genomes has provided insight into energy and nutrient cycling within the community, genome structure, gene function, population genetics and microheterogeneity, and lateral gene transfer among members of an uncultured community. The application of metagenomic sequence information will facilitate the design of better culturing strategies to link genomic analysis with pure culture studies.},
timestamp = {2016-06-16T16:07:39Z},
number = {4},
journaltitle = {Microbiology and Molecular Biology Reviews},
author = {Handelsman, Jo},
date = {2004-12},
pages = {669--685},
keywords = {Bacterial,Biotechnology,Ecology,Environmental Microbiology,Genetics,Genome,Genomics,Genomics: methods,Microbial},
eprinttype = {pmid},
eprint = {15590779}
}
@article{HauswedellLambda2014,
title = {Lambda: The Local Aligner for Massive Biological Data},
volume = {30},
issn = {1367-4803},
doi = {10.1093/bioinformatics/btu439},
timestamp = {2016-06-16T16:07:39Z},
number = {17},
journaltitle = {Bioinformatics},
author = {Hauswedell, Hannes and Singer, Jochen and Reinert, Knut},
date = {2014-09},
pages = {i349--i355}
}
@article{HessMetagenomic2011,
title = {Metagenomic Discovery of Biomass-Degrading Genes and Genomes from Cow Rumen},
volume = {331},
issn = {1095-9203},
doi = {10.1126/science.1200387},
abstract = {The paucity of enzymes that efficiently deconstruct plant polysaccharides represents a major bottleneck for industrial-scale conversion of cellulosic biomass into biofuels. Cow rumen microbes specialize in degradation of cellulosic plant material, but most members of this complex community resist cultivation. To characterize biomass-degrading genes and genomes, we sequenced and analyzed 268 gigabases of metagenomic DNA from microbes adherent to plant fiber incubated in cow rumen. From these data, we identified 27,755 putative carbohydrate-active genes and expressed 90 candidate proteins, of which 57\% were enzymatically active against cellulosic substrates. We also assembled 15 uncultured microbial genomes, which were validated by complementary methods including single-cell genome sequencing. These data sets provide a substantially expanded catalog of genes and genomes participating in the deconstruction of cellulosic biomass.},
timestamp = {2016-06-16T16:07:39Z},
number = {6016},
journaltitle = {Science},
author = {Hess, Matthias and Sczyrba, Alexander and Egan, Rob and Kim, Tae-Wan and Chokhawala, Harshal and Schroth, Gary and Luo, Shujun and Clark, Douglas S and Chen, Feng and Zhang, Tao and Mackie, Roderick I and Pennacchio, Len A. and Tringe, Susannah G and Visel, Axel and Woyke, Tanja and Wang, Zhong and Rubin, Edward M},
date = {2011-01},
pages = {463--467},
keywords = {4-beta-Cellobiosidase,4-beta-Cellobiosidase: genetics,4-beta-Cellobiosidase: metabolism,Amino Acid Sequence,Animals,Bacteria,Bacteria: enzymology,Bacteria: genetics,Bacteria: isolation & purification,Bacterial,Bacterial Proteins,Bacterial Proteins: chemistry,Bacterial Proteins: genetics,Bacterial Proteins: metabolism,Bacteria: metabolism,Biomass,Carbohydrate Metabolism,Cattle,Cattle: microbiology,Cellulase,Cellulase: genetics,Cellulase: metabolism,Cellulases,Cellulases: chemistry,Cellulases: genetics,Cellulases: metabolism,Cellulose,Cellulose 1,Cellulose: metabolism,DNA,Genes,Genome,Metagenome,metagenomics,Metagenomics: methods,Molecular Sequence Annotation,Molecular Sequence Data,Poaceae,Poaceae: microbiology,Rumen,Rumen: metabolism,Rumen: microbiology,Sequence Analysis},
eprinttype = {pmid},
eprint = {21273488}
}
@article{HuPirs2012,
title = {{{pIRS}}: {{Profile}}-Based {{Illumina}} Pair-End Reads Simulator},
volume = {28},
issn = {1367-4811},
doi = {10.1093/bioinformatics/bts187},
abstract = {The next-generation high-throughput sequencing technologies, especially from Illumina, have been widely used in re-sequencing and de novo assembly studies. However, there is no existing software that can simulate Illumina reads with real error and quality distributions and coverage bias yet, which is very useful in relevant software development and study designing of sequencing projects.},
timestamp = {2016-06-16T16:07:40Z},
number = {11},
journaltitle = {Bioinformatics},
author = {Hu, Xuesong and Yuan, Jianying and Shi, Yujian and Lu, Jianliang and Liu, Binghang and Li, Zhenyu and Chen, Yanxiang and Mu, Desheng and Zhang, Hao and Li, Nan and Yue, Zhen and Bai, Fan and Li, Heng and Fan, Wei},
date = {2012-06},
pages = {1533--1535},
eprinttype = {pmid},
eprint = {22508794}
}
@article{HuangArt2012,
title = {{{ART}}: A Next-Generation Sequencing Read Simulator},
volume = {28},
issn = {1367-4811},
doi = {10.1093/bioinformatics/btr708},
abstract = {ART is a set of simulation tools that generate synthetic next-generation sequencing reads. This functionality is essential for testing and benchmarking tools for next-generation sequencing data analysis including read alignment, de novo assembly and genetic variation discovery. ART generates simulated sequencing reads by emulating the sequencing process with built-in, technology-specific read error models and base quality value profiles parameterized empirically in large sequencing datasets. We currently support all three major commercial next-generation sequencing platforms: Roche's 454, Illumina's Solexa and Applied Biosystems' SOLiD. ART also allows the flexibility to use customized read error model parameters and quality profiles.},
timestamp = {2017-02-17T20:48:09Z},
number = {4},
journaltitle = {Bioinformatics},
author = {Huang, Weichun and Li, Leping and Myers, Jason R and Marth, Gabor T},
date = {2012-02},
pages = {593--594},
eprinttype = {pmid},
eprint = {22199392}
}
@article{HugenholtzExploring2002,
title = {Exploring Prokaryotic Diversity in the Genomic Era},
volume = {3},
issn = {1465-6914},
doi = {10.1186/gb-2002-3-2-reviews0003},
abstract = {Our understanding of prokaryote biology from study of pure cultures and genome sequencing has been limited by a pronounced sampling bias towards four bacterial phyla - Proteobacteria, Firmicutes, Actinobacteria and Bacteroidetes - out of 35 bacterial and 18 archaeal phylum-level lineages. This bias is beginning to be rectified by the use of phylogenetically directed isolation strategies and by directly accessing microbial genomes from environmental samples.},
timestamp = {2016-06-16T16:07:40Z},
number = {2},
journaltitle = {Genome Biology},
author = {Hugenholtz, Philip},
date = {2002-01},
pages = {REVIEWS0003},
keywords = {Actinobacteria,Actinobacteria: genetics,Actinobacteria: isolation & purification,Bacterial,Bacterial: genetics,Bacteroidaceae,Bacteroidaceae: genetics,Bacteroidaceae: isolation & purification,Genes,Genetic Variation,Genetic Variation: genetics,Genome,Gram-Positive Bacteria,Gram-Positive Bacteria: genetics,Gram-Positive Bacteria: isolation & purification,Phylogeny,Proteobacteria,Proteobacteria: genetics,Proteobacteria: isolation & purification},
eprinttype = {pmid},
eprint = {11864374}
}
@article{HuseExploring2008,
title = {Exploring Microbial Diversity and Taxonomy Using {{SSU rRNA}} Hypervariable Tag Sequencing},
volume = {4},
issn = {1553-7404},
doi = {10.1371/journal.pgen.1000255},
abstract = {Massively parallel pyrosequencing of hypervariable regions from small subunit ribosomal RNA (SSU rRNA) genes can sample a microbial community two or three orders of magnitude more deeply per dollar and per hour than capillary sequencing of full-length SSU rRNA. As with full-length rRNA surveys, each sequence read is a tag surrogate for a single microbe. However, rather than assigning taxonomy by creating gene trees de novo that include all experimental sequences and certain reference taxa, we compare the hypervariable region tags to an extensive database of rRNA sequences and assign taxonomy based on the best match in a Global Alignment for Sequence Taxonomy (GAST) process. The resulting taxonomic census provides information on both composition and diversity of the microbial community. To determine the effectiveness of using only hypervariable region tags for assessing microbial community membership, we compared the taxonomy assigned to the V3 and V6 hypervariable regions with the taxonomy assigned to full-length SSU rRNA sequences isolated from both the human gut and a deep-sea hydrothermal vent. The hypervariable region tags and full-length rRNA sequences provided equivalent taxonomy and measures of relative abundance of microbial communities, even for tags up to 15\% divergent from their nearest reference match. The greater sampling depth per dollar afforded by massively parallel pyrosequencing reveals many more members of the "rare biosphere" than does capillary sequencing of the full-length gene. In addition, tag sequencing eliminates cloning bias and the sequences are short enough to be completely sequenced in a single read, maximizing the number of organisms sampled in a run while minimizing chimera formation. This technique allows the cost-effective exploration of changes in microbial community structure, including the rare biosphere, over space and time and can be applied immediately to initiatives, such as the Human Microbiome Project.},
timestamp = {2016-06-16T16:07:42Z},
number = {11},
journaltitle = {PLoS Genetics},
author = {Huse, Susan M and Dethlefsen, Les and Huber, Julie A and Welch, David Mark and Relman, David A and Sogin, Mitchell L},
date = {2008-11},
pages = {e1000255},
keywords = {Bacteria,Bacteria: classification,Bacteria: genetics,Biodiversity,Classification,Classification: methods,DNA,Humans,Metagenome,Metagenome: genetics,Ribosomal,Ribosomal: genetics,RNA,Sequence Analysis,Sequence Tagged Sites},
eprinttype = {pmid},
eprint = {19023400}
}
@article{HusonIntegrative2011,
title = {Integrative Analysis of Environmental Sequences Using {{MEGAN4}}},
volume = {21},
issn = {1549-5469},
doi = {10.1101/gr.120618.111},
abstract = {A major challenge in the analysis of environmental sequences is data integration. The question is how to analyze different types of data in a unified approach, addressing both the taxonomic and functional aspects. To facilitate such analyses, we have substantially extended MEGAN, a widely used taxonomic analysis program. The new program, MEGAN4, provides an integrated approach to the taxonomic and functional analysis of metagenomic, metatranscriptomic, metaproteomic, and rRNA data. While taxonomic analysis is performed based on the NCBI taxonomy, functional analysis is performed using the SEED classification of subsystems and functional roles or the KEGG classification of pathways and enzymes. A number of examples illustrate how such analyses can be performed, and show that one can also import and compare classification results obtained using others' tools. MEGAN4 is freely available for academic purposes, and installers for all three major operating systems can be downloaded from www-ab.informatik.uni-tuebingen.de/software/megan.},
timestamp = {2016-06-16T16:07:43Z},
number = {9},
journaltitle = {Genome Research},
author = {Huson, Daniel H and Mitra, Suparna and Ruscheweyh, Hans-Joachim and Weber, Nico and Schuster, Stephan C},
date = {2011-09},
pages = {1552--1560},
keywords = {16S,16S: genetics,Classification,Metagenome,Metagenome: genetics,metagenomics,Proteomics,Ribosomal,RNA,Software,Transcriptome},
eprinttype = {pmid},
eprint = {21690186}
}
@article{HusonPoor2014,
title = {A Poor Man's {{BLASTX}}—High-Throughput Metagenomic Protein Database Search Using {{PAUDA}}},
volume = {30},
issn = {1367-4811},
doi = {10.1093/bioinformatics/btt254},
abstract = {SUMMARY: In the context of metagenomics, we introduce a new approach to protein database search called PAUDA, which runs ∼10,000 times faster than BLASTX, while achieving about one-third of the assignment rate of reads to KEGG orthology groups, and producing gene and taxon abundance profiles that are highly correlated to those obtained with BLASTX. PAUDA requires \textless{}80 CPU hours to analyze a dataset of 246 million Illumina DNA reads from permafrost soil for which a previous BLASTX analysis (on a subset of 176 million reads) reportedly required 800,000 CPU hours, leading to the same clustering of samples by functional profiles. AVAILABILITY: PAUDA is freely available from: http://ab.inf.uni-tuebingen.de/software/pauda. Also supplementary method details are available from this website.},
timestamp = {2016-06-16T16:07:43Z},
number = {1},
journaltitle = {Bioinformatics},
author = {Huson, Daniel H and Xie, Chao},
date = {2014-01},
pages = {38--39},
keywords = {Algorithms,Amino Acid Sequence,Base Sequence,Databases,DNA,metagenomics,Metagenomics: methods,Protein,Sequence Analysis,Software},
eprinttype = {pmid},
eprint = {23658416}
}
@article{ImelfortGroopm2014,
author = {Imelfort, Michael and Parks, Donovan and Woodcroft, Ben J. and Dennis, Paul and Hugenholtz, Philip and Tyson, Gene W.},
title = {{{GroopM}}: An Automated Tool for the Recovery of Population Genomes from Related Metagenomes},
journaltitle = {PeerJ},
date = {2014-09},
volume = {2},
pages = {e603},
issn = {2167-8359},
doi = {10.7717/peerj.603},
keywords = {bioinformatics,metagenomics,microbial ecology,population genome binning},
timestamp = {2016-06-16T16:07:43Z}
}
@article{IversonUntangling2012,
title = {Untangling Genomes from Metagenomes: Revealing an Uncultured Class of Marine {{Euryarchaeota}}},
volume = {335},
issn = {0036-8075},
doi = {10.1126/science.1212665},
timestamp = {2016-06-16T16:07:43Z},
number = {6068},
journaltitle = {Science},
author = {Iverson, Vaughn and Morris, Robert M. and Frazar, Christian D. and Berthiaume, Chris T. and Morales, Rhonda L. and Armbrust, E. Virginia},
date = {2012-02},
pages = {587--590}
}
@incollection{JeewonDetection2007,
title = {Detection and Diversity of Fungi from Environmental Samples: Traditional versus Molecular Approaches},
booktitle = {Advanced Techniques in Soil Microbiology},
editor = {Varma, Ajit and Oelmüller, Ralf},
series = {Soil Biology},
number = {11},
publisher = {Springer},
location = {Berlin, Heidelberg},
timestamp = {2016-06-16T16:07:44Z},
author = {Jeewon, Rajesh and Hyde, Kevin D.},
date = {2007},
pages = {1--15}
}
@article{KarlinDinucleotide1995,
title = {Dinucleotide Relative Abundance Extremes: A Genomic Signature},
volume = {11},
issn = {0168-9525},
abstract = {Early biochemical experiments established that the set of dinucleotide odds ratios or 'general design' is a remarkably stable property of the DNA of an organism, which is essentially the same in protein-coding DNA, bulk genomic DNA, and in different renaturation rate and density gradient fractions of genomic DNA in many organisms. Analysis of currently available genomic sequence data has extended these earlier results, showing that the general designs of disjoint samples of a genome are substantially more similar to each other than to those of sequences from other organisms and that closely related organisms have similar general designs. From this perspective, the set of dinucleotide odds ratio (relative abundance) values constitute a signature of each DNA genome, which can discriminate between sequences from different organisms. Dinucleotide-odds ratio values appear to reflect not only the chemistry of dinucleotide stacking energies and base-step conformational preferences, but also the species-specific properties of DNA modification, replication and repair mechanisms.},
timestamp = {2016-06-16T16:07:44Z},
number = {7},
journaltitle = {Trends in Genetics},
author = {Karlin, Samuel and Burge, Chris},
date = {1995-07},
pages = {283--290},
keywords = {Animals,CpG Islands,Dinucleotide Repeats,DNA,DNA: genetics,Genome},
eprinttype = {pmid},
eprint = {7482779}
}
@article{KislyukUnsupervised2009,
title = {Unsupervised Statistical Clustering of Environmental Shotgun Sequences},
volume = {10},
issn = {1471-2105},
doi = {10.1186/1471-2105-10-316},
abstract = {The development of effective environmental shotgun sequence binning methods remains an ongoing challenge in algorithmic analysis of metagenomic data. While previous methods have focused primarily on supervised learning involving extrinsic data, a first-principles statistical model combined with a self-training fitting method has not yet been developed.},
timestamp = {2017-02-17T20:47:53Z},
journaltitle = {BMC Bioinformatics},
author = {Kislyuk, Andrey and Bhatnagar, Srijak and Dushoff, Jonathan and Weitz, Joshua S},
date = {2009-10},
pages = {316},
keywords = {Algorithms,Cluster Analysis,DNA,DNA: methods,Genome,Genomics,Genomics: methods,metagenomics,Sequence Analysis,taxonomic binning,unsupervised taxonomic binning},
eprinttype = {pmid},
eprint = {19799776}
}
@article{KlumppNext2012,
title = {Next Generation Sequencing Technologies and the Changing Landscape of Phage Genomics},
volume = {2},
issn = {2159-7073},
doi = {10.4161/bact.22111},
abstract = {The dawn of next generation sequencing technologies has opened up exciting possibilities for whole genome sequencing of a plethora of organisms. The 2nd and 3rd generation sequencing technologies, based on cloning-free, massively parallel sequencing, have enabled the generation of a deluge of genomic sequences of both prokaryotic and eukaryotic origin in the last seven years. However, whole genome sequencing of bacterial viruses has not kept pace with this revolution, despite the fact that their genomes are orders of magnitude smaller in size compared with bacteria and other organisms. Sequencing phage genomes poses several challenges; (1) obtaining pure phage genomic material, (2) PCR amplification biases and (3) complex nature of their genetic material due to features such as methylated bases and repeats that are inherently difficult to sequence and assemble. Here we describe conclusions drawn from our efforts in sequencing hundreds of bacteriophage genomes from a variety of Gram-positive and Gram-negative bacteria using Sanger, 454, Illumina and PacBio technologies. Based on our experience we propose several general considerations regarding sample quality, the choice of technology and a "blended approach" for generating reliable whole genome sequences of phages.},
timestamp = {2016-06-16T16:07:47Z},
number = {3},
journaltitle = {Bacteriophage},
author = {Klumpp, Jochen and Fouts, Derrick E and Sozhamannan, Shanmuga},
date = {2012-07},
pages = {190--199},
keywords = {assembly,bacteriophage genome,hybrid genome,illumina hiseq,PacBio,roche 454,sanger sequencing,scaffolding,sispa},
eprinttype = {pmid},
eprint = {23275870}
}
@article{KorenBambus2011,
title = {Bambus 2: {{Scaffolding}} Metagenomes},
volume = {27},
issn = {1367-4803},
doi = {10.1093/bioinformatics/btr520},
timestamp = {2016-06-16T16:07:47Z},
number = {21},
journaltitle = {Bioinformatics},
author = {Koren, Sergey and Treangen, Todd J. and Pop, Mihai},
date = {2011-09},
pages = {2964--2971}
}
@article{KoslickiQuikr2013,
title = {Quikr: A Method for Rapid Reconstruction of Bacterial Communities via Compressive Sensing},
volume = {29},
issn = {1367-4811},
doi = {10.1093/bioinformatics/btt336},
abstract = {MOTIVATION: Many metagenomic studies compare hundreds to thousands of environmental and health-related samples by extracting and sequencing their 16S rRNA amplicons and measuring their similarity using beta-diversity metrics. However, one of the first steps–to classify the operational taxonomic units within the sample–can be a computationally time-consuming task because most methods rely on computing the taxonomic assignment of each individual read out of tens to hundreds of thousands of reads. RESULTS: We introduce Quikr: a QUadratic, K-mer-based, Iterative, Reconstruction method, which computes a vector of taxonomic assignments and their proportions in the sample using an optimization technique motivated from the mathematical theory of compressive sensing. On both simulated and actual biological data, we demonstrate that Quikr typically has less error and is typically orders of magnitude faster than the most commonly used taxonomic assignment technique (the Ribosomal Database Project's Naïve Bayesian Classifier). Furthermore, the technique is shown to be unaffected by the presence of chimeras, thereby allowing for the circumvention of the time-intensive step of chimera filtering. AVAILABILITY: The Quikr computational package (in MATLAB, Octave, Python and C) for the Linux and Mac platforms is available at http://sourceforge.net/projects/quikr/.},
timestamp = {2016-06-16T16:07:48Z},
number = {17},
journaltitle = {Bioinformatics},
author = {Koslicki, David and Foucart, Simon and Rosen, Gail},
date = {2013-09},
pages = {2096--2102},
keywords = {16S,16S: genetics,Algorithms,Bacteria,Bacteria: classification,Bacteria: genetics,Bacteria: isolation \& purification,Bayes Theorem,Classification,Classification: methods,DNA,DNA: methods,metagenomics,Microbiota,Phylogeny,Ribosomal,RNA,Sequence Analysis,Software},
eprinttype = {pmid},
eprint = {23786768}
}
@article{LasersonGenovo2011,
title = {Genovo: De Novo Assembly for Metagenomes},
volume = {18},
issn = {1557-8666},
doi = {10.1089/cmb.2010.0244},
abstract = {Next-generation sequencing technologies produce a large number of noisy reads from the DNA in a sample. Metagenomics and population sequencing aim to recover the genomic sequences of the species in the sample, which could be of high diversity. Methods geared towards single sequence reconstruction are not sensitive enough when applied in this setting. We introduce a generative probabilistic model of read generation from environmental samples and present Genovo, a novel de novo sequence assembler that discovers likely sequence reconstructions under the model. A nonparametric prior accounts for the unknown number of genomes in the sample. Inference is performed by applying a series of hill-climbing steps iteratively until convergence. We compare the performance of Genovo to three other short read assembly programs in a series of synthetic experiments and across nine metagenomic datasets created using the 454 platform, the largest of which has 311k reads. Genovo's reconstructions cover more bases and recover more genes than the other methods, even for low-abundance sequences, and yield a higher assembly score. Supplementary Material is available at www.liebertonline.com/cmb.},
timestamp = {2016-06-16T16:07:51Z},
number = {3},
journaltitle = {Journal of Computational Biology},
author = {Laserson, Jonathan and Jojic, Vladimir and Koller, Daphne},
date = {2011-03},
pages = {429--443},
keywords = {Algorithms,Animals,DNA,DNA: methods,Humans,Metagenome,metagenomics,Metagenomics: methods,Models,Sequence Analysis,Statistical},
eprinttype = {pmid},
eprint = {21385045}
}
@article{LindnerMetagenomic2013,
title = {Metagenomic Abundance Estimation and Diagnostic Testing on Species Level},
volume = {41},
issn = {1362-4962},
doi = {10.1093/nar/gks803},
abstract = {One goal of sequencing-based metagenomic community analysis is the quantitative taxonomic assessment of microbial community compositions. In particular, relative quantification of taxons is of high relevance for metagenomic diagnostics or microbial community comparison. However, the majority of existing approaches quantify at low resolution (e.g. at phylum level), rely on the existence of special genes (e.g. 16S), or have severe problems discerning species with highly similar genome sequences. Yet, problems as metagenomic diagnostics require accurate quantification on species level. We developed Genome Abundance Similarity Correction (GASiC), a method to estimate true genome abundances via read alignment by considering reference genome similarities in a non-negative LASSO approach. We demonstrate GASiC's superior performance over existing methods on simulated benchmark data as well as on real data. In addition, we present applications to datasets of both bacterial DNA and viral RNA source. We further discuss our approach as an alternative to PCR-based DNA quantification.},
timestamp = {2016-06-16T16:07:51Z},
number = {1},
journaltitle = {Nucleic Acids Research},
author = {Lindner, Martin S and Renard, Bernhard Y},
date = {2013-01},
pages = {e10},
keywords = {Algorithms,Bacterial,Bacterial: analysis,Bacterial: chemistry,Classification,Classification: methods,DNA,Escherichia coli,Escherichia coli: genetics,metagenomics,Metagenomics: methods,RNA,Sequence Alignment,Viral,Viral: analysis,Viral: chemistry},
eprinttype = {pmid},
eprint = {22941661}
}
@article{LucksGenome2008,
title = {Genome Landscapes and Bacteriophage Codon Usage},
volume = {4},
issn = {1553-7358},
doi = {10.1371/journal.pcbi.1000001},
abstract = {Across all kingdoms of biological life, protein-coding genes exhibit unequal usage of synonymous codons. Although alternative theories abound, translational selection has been accepted as an important mechanism that shapes the patterns of codon usage in prokaryotes and simple eukaryotes. Here we analyze patterns of codon usage across 74 diverse bacteriophages that infect E. coli, P. aeruginosa, and L. lactis as their primary host. We use the concept of a ``genome landscape,'' which helps reveal non-trivial, long-range patterns in codon usage across a genome. We develop a series of randomization tests that allow us to interrogate the significance of one aspect of codon usage, such as GC content, while controlling for another aspect, such as adaptation to host-preferred codons. We find that 33 phage genomes exhibit highly non-random patterns in their GC3-content, use of host-preferred codons, or both. We show that the head and tail proteins of these phages exhibit significant bias towards host-preferred codons, relative to the non-structural phage proteins. Our results support the hypothesis of translational selection on viral genes for host-preferred codons, over a broad range of bacteriophages.},
timestamp = {2016-06-16T16:07:53Z},
number = {2},
journaltitle = {PLoS Computational Biology},
author = {Lucks, Julius B and Nelson, David R and Kudla, Grzegorz R and Plotkin, Joshua B},
date = {2008-02},
pages = {e1000001},
keywords = {Bacteriophages,Bacteriophages: genetics,Biological Evolution,Chromosome Mapping,Chromosome Mapping: methods,Computer Simulation,Evolution,Genes,Genetic,Genetic Variation,Genetic Variation: genetics,Genome,Models,Molecular,Species Specificity,Viral,Viral: genetics},
eprinttype = {pmid},
eprint = {18463708}
}
@article{LuoSoapdenovo22012,
title = {{{SOAPdenovo2}}: An Empirically Improved Memory-Efficient Short-Read de Novo Assembler},
volume = {1},
issn = {2047-217X},
doi = {10.1186/2047-217X-1-18},
abstract = {BACKGROUND: There is a rapidly increasing amount of de novo genome assembly using next-generation sequencing (NGS) short reads; however, several big challenges remain to be overcome in order for this to be efficient and accurate. SOAPdenovo has been successfully applied to assemble many published genomes, but it still needs improvement in continuity, accuracy and coverage, especially in repeat regions. FINDINGS: To overcome these challenges, we have developed its successor, SOAPdenovo2, which has the advantage of a new algorithm design that reduces memory consumption in graph construction, resolves more repeat regions in contig assembly, increases coverage and length in scaffold construction, improves gap closing, and optimizes for large genome. CONCLUSIONS: Benchmark using the Assemblathon1 and GAGE datasets showed that SOAPdenovo2 greatly surpasses its predecessor SOAPdenovo and is competitive to other assemblers on both assembly length and accuracy. We also provide an updated assembly version of the 2008 Asian (YH) genome using SOAPdenovo2. Here, the contig and scaffold N50 of the YH genome were ∼20.9 kbp and ∼22 Mbp, respectively, which is 3-fold and 50-fold longer than the first published version. The genome coverage increased from 81.16\% to 93.91\%, and memory consumption was ∼2/3 lower during the point of largest memory consumption.},
timestamp = {2016-06-16T16:07:53Z},
number = {1},
journaltitle = {GigaScience},
author = {Luo, Ruibang and Liu, Binghang and Xie, Yinlong and Li, Zhenyu and Huang, Weihua and Yuan, Jianying and He, Guangzhu and Chen, Yanxiang and Pan, Qi and Liu, Yunjie and Tang, Jingbo and Wu, Gengxiong and Zhang, Hao and Shi, Yujian and Liu, Yong and Yu, Chang and Wang, Bo and Lu, Yao and Han, Changlei and Cheung, David W and Yiu, Siu-Ming and Peng, Shaoliang and Xiaoqian, Zhu and Liu, Guangming and Liao, Xiangke and Li, Yingrui and Yang, Huanming and Wang, Jian and Lam, Tak-Wah and Wang, Jun},
date = {2012-12},
pages = {18},
keywords = {assembly,contig,error correction,gap-filling,Genome,scaffold},
eprinttype = {pmid},
eprint = {23587118}
}
@article{MackelprangMetagenomic2011,
  author       = {Mackelprang, Rachel and Waldrop, Mark P. and DeAngelis, Kristen M. and David, Maude M. and Chavarria, Krystle L. and Blazewicz, Steven J. and Rubin, Edward M. and Jansson, Janet K.},
  title        = {Metagenomic Analysis of a Permafrost Microbial Community Reveals a Rapid Response to Thaw},
  journaltitle = {Nature},
  date         = {2011-11},
  volume       = {480},
  number       = {7377},
  pages        = {368--371},
  issn         = {0028-0836},
  doi          = {10.1038/nature10576},
  timestamp    = {2016-06-16T16:07:56Z}
}
@article{MardisImpact2008,
title = {The Impact of Next-Generation Sequencing Technology on Genetics},
volume = {24},
issn = {0168-9525},
doi = {10.1016/j.tig.2007.12.007},
abstract = {If one accepts that the fundamental pursuit of genetics is to determine the genotypes that explain phenotypes, the meteoric increase of DNA sequence information applied toward that pursuit has nowhere to go but up. The recent introduction of instruments capable of producing millions of DNA sequence reads in a single run is rapidly changing the landscape of genetics, providing the ability to answer questions with heretofore unimaginable speed. These technologies will provide an inexpensive, genome-wide sequence readout as an endpoint to applications ranging from chromatin immunoprecipitation, mutation mapping and polymorphism discovery to noncoding RNA discovery. Here I survey next-generation sequencing technologies and consider how they can provide a more complete picture of how the genome shapes the organism.},
timestamp = {2016-06-16T16:07:57Z},
number = {3},
journaltitle = {Trends in Genetics},
author = {Mardis, Elaine R},
date = {2008-03},
pages = {133--141},
keywords = {Animals,DNA,Forecasting,Genetics,Genetics: trends,Humans,Sequence Analysis},
eprinttype = {pmid},
eprint = {18262675}
}
@article{MatsenPplacer2010,
title = {Pplacer: Linear Time Maximum-Likelihood and {{Bayesian}} Phylogenetic Placement of Sequences onto a Fixed Reference Tree},
volume = {11},
issn = {1471-2105},
doi = {10.1186/1471-2105-11-538},
abstract = {Likelihood-based phylogenetic inference is generally considered to be the most reliable classification method for unknown sequences. However, traditional likelihood-based phylogenetic methods cannot be applied to large volumes of short reads from next-generation sequencing due to computational complexity issues and lack of phylogenetic signal. ``Phylogenetic placement,'' where a reference tree is fixed and the unknown query sequences are placed onto the tree via a reference alignment, is a way to bring the inferential power offered by likelihood-based approaches to large data sets.},
timestamp = {2016-06-16T16:07:58Z},
number = {1},
journaltitle = {BMC Bioinformatics},
author = {Matsen, Frederick A and Kodner, Robin B and Armbrust, E Virginia},
date = {2010-10},
pages = {538},
keywords = {Base Sequence,Bayes Theorem,Computational Biology,Computational Biology: methods,Likelihood Functions,metagenomics,phylogenetics,Phylogeny,Sequence Alignment,Sequence Alignment: methods,Software},
eprinttype = {pmid},
eprint = {21034504}
}
@article{MavromatisUse2007,
title = {Use of Simulated Data Sets to Evaluate the Fidelity of Metagenomic Processing Methods},
volume = {4},
issn = {1548-7091},
doi = {10.1038/nmeth1043},
abstract = {Metagenomics is a rapidly emerging field of research for studying microbial communities. To evaluate methods presently used to process metagenomic sequences, we constructed three simulated data sets of varying complexity by combining sequencing reads randomly selected from 113 isolate genomes. These data sets were designed to model real metagenomes in terms of complexity and phylogenetic composition. We assembled sampled reads using three commonly used genome assemblers (Phrap, Arachne and JAZZ), and predicted genes using two popular gene-finding pipelines (fgenesb and CRITICA/GLIMMER). The phylogenetic origins of the assembled contigs were predicted using one sequence similarity-based (blast hit distribution) and two sequence composition-based (PhyloPythia, oligonucleotide frequencies) binning methods. We explored the effects of the simulated community structure and method combinations on the fidelity of each processing step by comparison to the corresponding isolate genomes. The simulated data sets are available online to facilitate standardized benchmarking of tools for metagenomic analysis.},
timestamp = {2016-06-16T16:07:59Z},
number = {6},
journaltitle = {Nature Methods},
author = {Mavromatis, Konstantinos and Ivanova, Natalia and Barry, Kerrie and Shapiro, Harris and Goltsman, Eugene and McHardy, Alice Carolyn and Rigoutsos, Isidore and Salamov, Asaf and Korzeniewski, Frank and Land, Miriam and Lapidus, Alla and Grigoriev, Igor and Richardson, Paul and Hugenholtz, Philip and Kyrpides, Nikos C},
date = {2007-06},
pages = {495--500},
keywords = {Bacterial,Bacterial: genetics,Cluster Analysis,Computational Biology,Computational Biology: methods,Computational Biology: standards,Computer Simulation,Databases,Genetic,Genome,Genomics,Genomics: methods,Genomics: standards,Phylogeny,Software},
eprinttype = {pmid},
eprint = {17468765}
}
@article{MchardyAccurate2007,
title = {Accurate Phylogenetic Classification of Variable-Length {{DNA}} Fragments},
volume = {4},
issn = {1548-7091},
doi = {10.1038/nmeth976},
abstract = {Metagenome studies have retrieved vast amounts of sequence data from a variety of environments leading to new discoveries and insights into the uncultured microbial world. Except for very simple communities, the encountered diversity has made fragment assembly and the subsequent analysis a challenging problem. A taxonomic characterization of metagenomic fragments is required for a deeper understanding of shotgun-sequenced microbial communities, but success has mostly been limited to sequences containing phylogenetic marker genes. Here we present PhyloPythia, a composition-based classifier that combines higher-level generic clades from a set of 340 completed genomes with sample-derived population models. Extensive analyses on synthetic and real metagenome data sets showed that PhyloPythia allows the accurate classification of most sequence fragments across all considered taxonomic ranks, even for unknown organisms. The method requires no more than 100 kb of training sequence for the creation of accurate models of sample-specific populations and can assign fragments $\geq$1 kb with high specificity.},
timestamp = {2016-06-16T16:07:59Z},
number = {1},
journaltitle = {Nature Methods},
author = {McHardy, Alice Carolyn and García Martín, Héctor and Tsirigos, Aristotelis and Hugenholtz, Philip and Rigoutsos, Isidore},
date = {2007-01},
pages = {63--72},
keywords = {Animals,Archaea,Archaea: genetics,Arthropods,Arthropods: genetics,Ascomycota,Ascomycota: genetics,Bacteria,Bacteria: genetics,Chordata,Chordata: genetics,DNA,DNA: chemistry,DNA: classification,DNA: genetics,Eukaryotic Cells,Genome,Genomics,Genomics: methods,Industrial Waste,Phylogeny,Sargassum,Sargassum: microbiology,Software Validation},
eprinttype = {pmid},
eprint = {17179938}
}
@article{MetzkerSequencing2009,
title = {Sequencing Technologies — the Next Generation},
volume = {11},
issn = {1471-0056},
doi = {10.1038/nrg2626},
timestamp = {2016-06-16T16:08:00Z},
number = {1},
journaltitle = {Nature Reviews Genetics},
author = {Metzker, Michael L.},
date = {2009-12},
pages = {31--46}
}
@article{MeyerMetagenomics2008,
title = {The Metagenomics {{RAST}} Server -- a Public Resource for the Automatic Phylogenetic and Functional Analysis of Metagenomes},
volume = {9},
issn = {1471-2105},
doi = {10.1186/1471-2105-9-386},
abstract = {BACKGROUND: Random community genomes (metagenomes) are now commonly used to study microbes in different environments. Over the past few years, the major challenge associated with metagenomics shifted from generating to analyzing sequences. High-throughput, low-cost next-generation sequencing has provided access to metagenomics to a wide range of researchers. RESULTS: A high-throughput pipeline has been constructed to provide high-performance computing to all researchers interested in using metagenomics. The pipeline produces automated functional assignments of sequences in the metagenome by comparing both protein and nucleotide databases. Phylogenetic and functional summaries of the metagenomes are generated, and tools for comparative metagenomics are incorporated into the standard views. User access is controlled to ensure data privacy, but the collaborative environment underpinning the service provides a framework for sharing datasets between multiple users. In the metagenomics RAST, all users retain full control of their data, and everything is available for download in a variety of formats. CONCLUSION: The open-source metagenomics RAST service provides a new paradigm for the annotation and analysis of metagenomes. With built-in support for multiple data sources and a back end that houses abstract data types, the metagenomics RAST is stable, extensible, and freely available to all researchers. This service has removed one of the primary bottlenecks in metagenome sequence analysis - the availability of high-performance computing for annotating the data. http://metagenomics.nmpdr.org.},
timestamp = {2016-06-16T16:08:02Z},
journaltitle = {BMC Bioinformatics},
author = {Meyer, Folker and Paarmann, D and D'Souza, M and Olson, R and Glass, E M and Kubal, M and Paczian, T and Rodriguez, A and Stevens, R and Wilke, A and Wilkening, J and Edwards, R A},
date = {2008-09},
pages = {386},
keywords = {Algorithms,Database Management Systems,Databases,Genetic,Information Storage and Retrieval,Information Storage and Retrieval: methods,Internet,Phylogeny,Proteome,Proteome: genetics,Software,User-Computer Interface},
eprinttype = {pmid},
eprint = {18803844}
}
@article{MillerAssembly2010,
title = {Assembly Algorithms for Next-Generation Sequencing Data},
volume = {95},
issn = {1089-8646},
doi = {10.1016/j.ygeno.2010.03.001},
abstract = {The emergence of next-generation sequencing platforms led to resurgence of research in whole-genome shotgun assembly algorithms and software. DNA sequencing data from the Roche 454, Illumina/Solexa, and ABI SOLiD platforms typically present shorter read lengths, higher coverage, and different error profiles compared with Sanger sequencing data. Since 2005, several assembly software packages have been created or revised specifically for de novo assembly of next-generation sequencing data. This review summarizes and compares the published descriptions of packages named SSAKE, SHARCGS, VCAKE, Newbler, Celera Assembler, Euler, Velvet, ABySS, AllPaths, and SOAPdenovo. More generally, it compares the two standard methods known as the de Bruijn graph approach and the overlap/layout/consensus approach to assembly.},
timestamp = {2016-06-16T16:08:02Z},
number = {6},
journaltitle = {Genomics},
author = {Miller, Jason R and Koren, Sergey and Sutton, Granger},
date = {2010-06},
pages = {315--327},
keywords = {Algorithms,DNA,DNA: methods,DNA: trends,Forecasting,Sequence Analysis,Software},
eprinttype = {pmid},
eprint = {20211242}
}
@article{MirarabSepp2012,
title = {{{SEPP}}: {{SATé}}-{{Enabled Phylogenetic Placement}}},
issn = {1793-5091},
abstract = {We address the problem of Phylogenetic Placement, in which the objective is to insert short molecular sequences (called query sequences) into an existing phylogenetic tree and alignment on full-length sequences for the same gene. Phylogenetic placement has the potential to provide information beyond pure ``species identification'' (i.e., the association of metagenomic reads to existing species), because it can also give information about the evolutionary relationships between these query sequences and to known species. Approaches for phylogenetic placement have been developed that operate in two steps: first, an alignment is estimated for each query sequence to the alignment of the full-length sequences, and then that alignment is used to find the optimal location in the phylogenetic tree for the query sequence. Recent methods of this type include HMMALIGN+EPA, HMMALIGN+pplacer, and PaPaRa+EPA. We report on a study evaluating phylogenetic placement methods on biological and simulated data. This study shows that these methods have extremely good accuracy and computational tractability under conditions where the input contains a highly accurate alignment and tree for the full-length sequences, and the set of full-length sequences is sufficiently small and not too evolutionarily diverse; however, we also show that under other conditions accuracy declines and the computational requirements for memory and time exceed acceptable limits. We present SEPP, a general ``boosting'' technique to improve the accuracy and/or speed of phylogenetic placement techniques. The key algorithmic aspect of this booster is a dataset decomposition technique in SATé, a method that utilizes an iterative divide-and-conquer technique to co-estimate alignments and trees on large molecular sequence datasets.
We show that SATé-boosting improves HMMALIGN+pplacer, placing short sequences more accurately when the set of input sequences has a large evolutionary diameter and produces placements of comparable accuracy in a fraction of the time for easier cases. SEPP software and the datasets used in this study are all available for free at http://www.cs.utexas.edu/users/phylo/software/sepp/submission.},
timestamp = {2016-06-16T16:08:02Z},
journaltitle = {Pacific Symposium on Biocomputing},
author = {Mirarab, Siavash and Nguyen, Nam and Warnow, Tandy},
date = {2012-01},
pages = {247--258},
keywords = {metagenomic analysis,phylogenetic placement},
eprinttype = {pmid},
eprint = {22174280}
}
@article{MonzoorulhaqueSortitems2009,
title = {{{SOrt}}-{{ITEMS}}: {{Sequence}} Orthology Based Approach for Improved Taxonomic Estimation of Metagenomic Sequences},
volume = {25},
issn = {1367-4811},
doi = {10.1093/bioinformatics/btp317},
abstract = {MOTIVATION: One of the first steps in metagenomic analysis is the assignment of reads/contigs obtained from various sequencing technologies to their correct taxonomic bins. Similarity-based binning methods assign a read to a taxon/clade, based on the pattern of significant BLAST hits generated against sequence databases. Existing methods, which use bit-score as the sole parameter to ascertain the significance of BLAST hits, have limited specificity and accuracy of binning. A new binning algorithm, called SOrt-ITEMS is introduced, which addresses these limitations. The method uses alignment parameters besides the bit score to first identify an appropriate taxonomic level where the read can be assigned. An orthology-based approach is subsequently used by the method for the final assignment. RESULTS: The performance of SOrt-ITEMS has been validated with reads simulating sequences from 454 and Sanger sequencing technologies. In addition, the taxonomic composition of the Sargasso Sea data set has been analyzed using SOrt-ITEMS. SOrt-ITEMS shows improved specificity and accuracy of assignments especially in simulated scenarios, wherein sequences corresponding to the source organism of the reads are absent in the reference database. AVAILABILITY: SOrt-ITEMS software is available for download from: http://metagenomics.atc.tcs.com/binning/SOrt-ITEMS. No license is needed for academic and nonprofit use.},
timestamp = {2016-06-16T16:08:03Z},
number = {14},
journaltitle = {Bioinformatics},
author = {Monzoorul Haque, M and Ghosh, Tarini Shankar and Komanduri, Dinakar and Mande, Sharmila S},
date = {2009-07},
pages = {1722--1730},
keywords = {Algorithms,Base Sequence,Classification,Classification: methods,Databases,Genetic,Genomics,Genomics: methods,Phylogeny,Sequence Alignment,Software},
eprinttype = {pmid},
eprint = {19439565}
}
@article{NelsonCatalog2010,
title = {A Catalog of Reference Genomes from the Human Microbiome},
volume = {328},
issn = {1095-9203},
doi = {10.1126/science.1183605},
abstract = {The human microbiome refers to the community of microorganisms, including prokaryotes, viruses, and microbial eukaryotes, that populate the human body. The National Institutes of Health launched an initiative that focuses on describing the diversity of microbial species that are associated with health and disease. The first phase of this initiative includes the sequencing of hundreds of microbial reference genomes, coupled to metagenomic sequencing from multiple body sites. Here we present results from an initial reference genome sequencing of 178 microbial genomes. From 547,968 predicted polypeptides that correspond to the gene complement of these strains, previously unidentified (``novel'') polypeptides that had both unmasked sequence length greater than 100 amino acids and no BLASTP match to any nonreference entry in the nonredundant subset were defined. This analysis resulted in a set of 30,867 polypeptides, of which 29,987 (approximately 97\%) were unique. In addition, this set of microbial genomes allows for approximately 40\% of random sequences from the microbiome of the gastrointestinal tract to be associated with organisms based on the match criteria used. Insights into pan-genome analysis suggest that we are still far from saturating microbial species genetic data sets. In addition, the associated metrics and standards used by our group for quality assurance are presented.},
timestamp = {2016-06-16T16:08:06Z},
number = {5981},
journaltitle = {Science},
author = {Nelson, Karen E and Weinstock, George M and Highlander, Sarah K and Worley, Kim C and Creasy, Heather Huot and Wortman, Jennifer Russo and Rusch, Douglas B and Mitreva, Makedonka and Sodergren, Erica and Chinwalla, Asif T and Feldgarden, Michael and Gevers, Dirk and Haas, Brian J and Madupu, Ramana and Ward, Doyle V and Birren, Bruce W and Gibbs, Richard A and Methe, Barbara and Petrosino, Joseph F and Strausberg, Robert L and Sutton, Granger G and White, Owen R and Wilson, Richard K and Durkin, Scott and Giglio, Michelle Gwinn and Gujja, Sharvari and Howarth, Clint and Kodira, Chinnappa D and Kyrpides, Nikos and Mehta, Teena and Muzny, Donna M and Pearson, Matthew and Pepin, Kymberlie and Pati, Amrita and Qin, Xiang and Yandava, Chandri and Zeng, Qiandong and Zhang, Lan and Berlin, Aaron M and Chen, Lei and Hepburn, Theresa A and Johnson, Justin and McCorrison, Jamison and Miller, Jason and Minx, Pat and Nusbaum, Chad and Russ, Carsten and Sykes, Sean M and Tomlinson, Chad M and Young, Sarah and Warren, Wesley C and Badger, Jonathan and Crabtree, Jonathan and Markowitz, Victor M and Orvis, Joshua and Cree, Andrew and Ferriera, Steve and Fulton, Lucinda L and Fulton, Robert S and Gillis, Marcus and Hemphill, Lisa D and Joshi, Vandita and Kovar, Christie and Torralba, Manolito and Wetterstrand, Kris A and Abouellleil, Amr and Wollam, Aye M and Buhay, Christian J and Ding, Yan and Dugan, Shannon and FitzGerald, Michael G and Holder, Mike and Hostetler, Jessica and Clifton, Sandra W and Allen-Vercoe, Emma and Earl, Ashlee M and Farmer, Candace N and Liolios, Konstantinos and Surette, Michael G and Xu, Qiang and Pohl, Craig and Wilczek-Boney, Katarzyna and Zhu, Dianhui},
date = {2010-05},
pages = {994--999},
keywords = {Archaeal,Bacteria,Bacteria: classification,Bacteria: genetics,Bacterial,Bacterial Proteins,Bacterial Proteins: chemistry,Bacterial Proteins: genetics,Biodiversity,Computational Biology,Databases,DNA,DNA: standards,Gastrointestinal Tract,Gastrointestinal Tract: microbiology,Genes,Genetic,Genetic Variation,Genome,Humans,Metagenome,Metagenome: genetics,metagenomics,Metagenomics: methods,Metagenomics: standards,Mouth,Mouth: microbiology,Peptides,Peptides: chemistry,Peptides: genetics,Phylogeny,Respiratory System,Respiratory System: microbiology,Sequence Analysis,Skin,Skin: microbiology,Urogenital System,Urogenital System: microbiology},
eprinttype = {pmid},
eprint = {20489017}
}
@article{PatilTaxonomic2011,
  author       = {Patil, Kaustubh R and Haider, Peter and Pope, Phillip B and Turnbaugh, Peter J and Morrison, Mark and Scheffer, Tobias and McHardy, Alice Carolyn},
  title        = {Taxonomic Metagenome Sequence Assignment with Structured Output Models},
  journaltitle = {Nature Methods},
  date         = {2011-03},
  volume       = {8},
  number       = {3},
  pages        = {191--192},
  issn         = {1548-7091},
  doi          = {10.1038/nmeth0311-191},
  timestamp    = {2016-06-16T16:08:11Z}
}
@article{PellScaling2012,
title = {Scaling Metagenome Sequence Assembly with Probabilistic de {{Bruijn}} Graphs},
volume = {109},
doi = {10.1073/pnas.1121464109},
abstract = {The memory requirements for de novo assembly of short-read shotgun sequencing data from complex microbial populations are an increasingly large practical barrier to environmental studies. Here we introduce a memory-efficient graph representation with which we can analyze the k-mer connectivity of metagenomic samples, allowing us to reduce the size of the de novo assembly process for metagenomes with a ``divide and conquer'' algorithm. This graph representation is based on a probabilistic data structure, a Bloom filter, that allows us to store assembly graphs in as little as 4 bits per k-mer. We use this approach to achieve a 20-fold decrease in memory for the assembly of a soil metagenome sample.},
timestamp = {2017-02-21T14:57:39Z},
number = {33},
journaltitle = {Proceedings of the National Academy of Sciences of the United States of America},
author = {Pell, Jason and Hintze, Arend and Canino-Koning, Rosangela and Howe, Adina and Tiedje, James M. and Brown, C. Titus},
date = {2012-08},
pages = {13272--13277},
eprinttype = {arXiv},
eprint = {1112.4193}
}
% Peng et al. (2011), Bioinformatics 27(13): Meta-IDBA, a de novo assembler for metagenomic reads.
% Title carries no terminal period (the style supplies punctuation); journal name is stored without the PubMed place qualifier.
@article{PengMetaidba2011,
title = {Meta-{{IDBA}}: A de Novo Assembler for Metagenomic Data},
volume = {27},
issn = {1367-4811},
doi = {10.1093/bioinformatics/btr216},
abstract = {Next-generation sequencing techniques allow us to generate reads from a microbial environment in order to analyze the microbial community. However, assembling of a set of mixed reads from different species to form contigs is a bottleneck of metagenomic research. Although there are many assemblers for assembling reads from a single genome, there are no assemblers for assembling reads in metagenomic data without reference genome sequences. Moreover, the performances of these assemblers on metagenomic data are far from satisfactory, because of the existence of common regions in the genomes of subspecies and species, which make the assembly problem much more complicated.},
timestamp = {2016-06-16T16:08:12Z},
number = {13},
journaltitle = {Bioinformatics},
author = {Peng, Yu and Leung, Henry C M and Yiu, S M and Chin, Francis Y L},
date = {2011-07},
pages = {i94--i101},
eprinttype = {pmid},
eprint = {21685107}
}
% Pope et al. (2010), PNAS 107(33): metagenomic characterization of the Tammar wallaby foregut microbiome.
% Author initials are uppercase: a lowercase leading "a" would be parsed as a von-particle or printed as a lowercase given name.
% The page range is written out in full.
@article{PopeAdaptation2010,
title = {Adaptation to Herbivory by the {{Tammar}} Wallaby Includes Bacterial and Glycoside Hydrolase Profiles Different from Other Herbivores},
volume = {107},
issn = {1091-6490},
doi = {10.1073/pnas.1005297107},
abstract = {Metagenomic and bioinformatic approaches were used to characterize plant biomass conversion within the foregut microbiome of Australia's "model" marsupial, the Tammar wallaby (Macropus eugenii). Like the termite hindgut and bovine rumen, key enzymes and modular structures characteristic of the "free enzyme" and "cellulosome" paradigms of cellulose solubilization remain either poorly represented or elusive to capture by shotgun sequencing methods. Instead, multigene polysaccharide utilization loci-like systems coupled with genes encoding beta-1,4-endoglucanases and beta-1,4-endoxylanases–which have not been previously encountered in metagenomic datasets–were identified, as were a diverse set of glycoside hydrolases targeting noncellulosic polysaccharides. Furthermore, both rrs gene and other phylogenetic analyses confirmed that unique clades of the Lachnospiraceae, Bacteroidales, and Gammaproteobacteria are predominant in the Tammar foregut microbiome. Nucleotide composition-based sequence binning facilitated the assemblage of more than two megabase pairs of genomic sequence for one of the novel Lachnospiraceae clades (WG-2). These analyses show that WG-2 possesses numerous glycoside hydrolases targeting noncellulosic polysaccharides. These collective data demonstrate that Australian macropods not only harbor unique bacterial lineages underpinning plant biomass conversion, but their repertoire of glycoside hydrolases is distinct from those of the microbiomes of higher termites and the bovine rumen.},
timestamp = {2016-06-16T16:08:13Z},
number = {33},
journaltitle = {Proceedings of the National Academy of Sciences of the United States of America},
author = {Pope, P B and Denman, S E and Jones, M and Tringe, S G and Barry, K and Malfatti, S A and McHardy, A C and Cheng, J-F and Hugenholtz, P and McSweeney, C S and Morrison, M},
date = {2010-08},
pages = {14793--14798},
keywords = {16S,16S: genetics,Adaptation,Animals,Bacteria,Bacteria: classification,Bacteria: genetics,Bacteria: metabolism,Cellulosomes,Cellulosomes: metabolism,DNA,Gastrointestinal Tract,Gastrointestinal Tract: microbiology,Glycoside Hydrolases,Glycoside Hydrolases: classification,Glycoside Hydrolases: genetics,Glycoside Hydrolases: metabolism,Macropodidae,Macropodidae: genetics,Macropodidae: microbiology,Macropodidae: physiology,Metagenome,Metagenome: genetics,metagenomics,Metagenomics: methods,Molecular Sequence Data,Phylogeny,Physiological,Physiological: genetics,Physiological: physiology,Plants,Plants: metabolism,Ribosomal,RNA,Seasons,Sequence Analysis},
eprinttype = {pmid},
eprint = {20668243}
}
% Pope et al. (2011), Science 333(6042): isolation of a Succinivibrionaceae bacterium (WG-1) from the Tammar wallaby gut.
% Author initials are uppercase, the page range is written out in full, and the journal name has no place qualifier.
@article{PopeIsolation2011,
title = {Isolation of {{Succinivibrionaceae}} Implicated in Low Methane Emissions from {{Tammar}} Wallabies},
volume = {333},
issn = {1095-9203},
doi = {10.1126/science.1205760},
abstract = {The Tammar wallaby (Macropus eugenii) harbors unique gut bacteria and produces only one-fifth the amount of methane produced by ruminants per unit of digestible energy intake. We have isolated a dominant bacterial species (WG-1) from the wallaby microbiota affiliated with the family Succinivibrionaceae and implicated in lower methane emissions from starch-containing diets. This was achieved by using a partial reconstruction of the bacterium's metabolism from binned metagenomic data (nitrogen and carbohydrate utilization pathways and antibiotic resistance) to devise cultivation-based strategies that produced axenic WG-1 cultures. Pure-culture studies confirm that the bacterium is capnophilic and produces succinate, further explaining a microbiological basis for lower methane emissions from macropodids. This knowledge also provides new strategic targets for redirecting fermentation and reducing methane production in livestock.},
timestamp = {2016-06-16T16:08:14Z},
number = {6042},
journaltitle = {Science},
author = {Pope, P B and Smith, W and Denman, S E and Tringe, S G and Barry, K and Hugenholtz, P and McSweeney, C S and McHardy, A C and Morrison, M},
date = {2011-07},
pages = {646--648},
keywords = {Animals,Bacterial,Carbohydrate Metabolism,Digestive System,Digestive System: microbiology,Female,Fermentation,Genome,Macropodidae,Macropodidae: microbiology,Metagenome,Methane,Methane: metabolism,Molecular Sequence Data,Starch,Starch: metabolism,Succinic Acid,Succinic Acid: metabolism,Succinivibrionaceae,Succinivibrionaceae: genetics,Succinivibrionaceae: growth & development,Succinivibrionaceae: isolation & purification,Succinivibrionaceae: metabolism},
eprinttype = {pmid},
eprint = {21719642}
}
% Pride and Schoenfeld (2008), BMC Genomics 9:420: genome-signature classification (GSPC) of hot-spring viral metagenomes.
% The journal publishes by article number, so there is no issue number and pages holds the article id.
@article{PrideGenome2008,
title = {Genome Signature Analysis of Thermal Virus Metagenomes Reveals {{Archaea}} and Thermophilic Signatures},
volume = {9},
issn = {1471-2164},
doi = {10.1186/1471-2164-9-420},
abstract = {Metagenomic analysis provides a rich source of biological information for otherwise intractable viral communities. However, study of viral metagenomes has been hampered by its nearly complete reliance on BLAST algorithms for identification of DNA sequences. We sought to develop algorithms for examination of viral metagenomes to identify the origin of sequences independent of BLAST algorithms. We chose viral metagenomes obtained from two hot springs, Bear Paw and Octopus, in Yellowstone National Park, as they represent simple microbial populations where comparatively large contigs were obtained. Thermal spring metagenomes have high proportions of sequences without significant Genbank homology, which has hampered identification of viruses and their linkage with hosts. To analyze each metagenome, we developed a method to classify DNA fragments using genome signature-based phylogenetic classification (GSPC), where metagenomic fragments are compared to a database of oligonucleotide signatures for all previously sequenced Bacteria, Archaea, and viruses.},
timestamp = {2016-06-16T16:08:16Z},
journaltitle = {BMC Genomics},
author = {Pride, David T and Schoenfeld, Thomas},
date = {2008-09},
pages = {420},
keywords = {Archaea,Archaeal Viruses,Archaeal Viruses: classification,Archaeal Viruses: genetics,Archaea: virology,Genomics,Genomics: methods,Hot Springs,Hot Springs: virology,Northwestern United States},
eprinttype = {pmid},
eprint = {18798991}
}
% Pruesse et al. (2007), Nucleic Acids Research 35(21): the SILVA database of quality-checked, aligned rRNA sequences.
% The page range is written out in full and the journal name is stored in its full title-case form.
@article{PruesseSilva2007,
title = {{{SILVA}}: A Comprehensive Online Resource for Quality Checked and Aligned Ribosomal {{RNA}} Sequence Data Compatible with {{ARB}}},
volume = {35},
issn = {1362-4962},
doi = {10.1093/nar/gkm864},
abstract = {Sequencing ribosomal RNA (rRNA) genes is currently the method of choice for phylogenetic reconstruction, nucleic acid based detection and quantification of microbial diversity. The ARB software suite with its corresponding rRNA datasets has been accepted by researchers worldwide as a standard tool for large scale rRNA analysis. However, the rapid increase of publicly available rRNA sequence data has recently hampered the maintenance of comprehensive and curated rRNA knowledge databases. A new system, SILVA (from Latin silva, forest), was implemented to provide a central comprehensive web resource for up to date, quality controlled databases of aligned rRNA sequences from the Bacteria, Archaea and Eukarya domains. All sequences are checked for anomalies, carry a rich set of sequence associated contextual information, have multiple taxonomic classifications, and the latest validly described nomenclature. Furthermore, two precompiled sequence datasets compatible with ARB are offered for download on the SILVA website: (i) the reference (Ref) datasets, comprising only high quality, nearly full length sequences suitable for in-depth phylogenetic analysis and probe design and (ii) the comprehensive Parc datasets with all publicly available rRNA sequences longer than 300 nucleotides suitable for biodiversity analyses. The latest publicly available database release 91 (August 2007) hosts 547 521 sequences split into 461 823 small subunit and 85 689 large subunit rRNAs.},
timestamp = {2016-06-16T16:08:16Z},
number = {21},
journaltitle = {Nucleic Acids Research},
author = {Pruesse, Elmar and Quast, Christian and Knittel, Katrin and Fuchs, Bernhard M and Ludwig, Wolfgang and Peplies, Jörg and Glöckner, Frank Oliver},
date = {2007-12},
pages = {7188--7196},
keywords = {Base Sequence,Databases,Genes,Internet,Nucleic Acid,Nucleic Acid: standards,Phylogeny,Quality Control,Ribosomal,Ribosomal: genetics,RNA,rRNA,Sequence Alignment,Sequence Analysis,Software},
eprinttype = {pmid},
eprint = {17947321}
}
% Qin et al. (2010), Nature 464(7285): human gut microbial gene catalogue from Illumina metagenomic sequencing.
% Title carries no terminal period; the bibliography style supplies the punctuation.
@article{QinHuman2010,
title = {A Human Gut Microbial Gene Catalogue Established by Metagenomic Sequencing},
volume = {464},
issn = {1476-4687},
doi = {10.1038/nature08821},
abstract = {To understand the impact of gut microbes on human health and well-being it is crucial to assess their genetic potential. Here we describe the Illumina-based metagenomic sequencing, assembly and characterization of 3.3 million non-redundant microbial genes, derived from 576.7 gigabases of sequence, from faecal samples of 124 European individuals. The gene set, approximately 150 times larger than the human gene complement, contains an overwhelming majority of the prevalent (more frequent) microbial genes of the cohort and probably includes a large proportion of the prevalent human intestinal microbial genes. The genes are largely shared among individuals of the cohort. Over 99\% of the genes are bacterial, indicating that the entire cohort harbours between 1,000 and 1,150 prevalent bacterial species and each individual at least 160 such species, which are also largely shared. We define and describe the minimal gut metagenome and the minimal gut bacterial genome in terms of functions present in all individuals and most bacteria, respectively.},
timestamp = {2016-06-16T16:08:16Z},
number = {7285},
journaltitle = {Nature},
author = {Qin, Junjie and Li, Ruiqiang and Raes, Jeroen and Arumugam, Manimozhiyan and Burgdorf, Kristoffer Solvsten and Manichanh, Chaysavanh and Nielsen, Trine and Pons, Nicolas and Levenez, Florence and Yamada, Takuji and Mende, Daniel R and Li, Junhua and Xu, Junming and Li, Shaochuan and Li, Dongfang and Cao, Jianjun and Wang, Bo and Liang, Huiqing and Zheng, Huisong and Xie, Yinlong and Tap, Julien and Lepage, Patricia and Bertalan, Marcelo and Batto, Jean-Michel and Hansen, Torben and Le Paslier, Denis and Linneberg, Allan and Nielsen, H Bjørn and Pelletier, Eric and Renault, Pierre and Sicheritz-Ponten, Thomas and Turner, Keith and Zhu, Hongmei and Yu, Chang and Li, Shengting and Jian, Min and Zhou, Yan and Li, Yingrui and Zhang, Xiuqing and Li, Songgang and Qin, Nan and Yang, Huanming and Wang, Jian and Brunak, Søren and Doré, Joel and Guarner, Francisco and Kristiansen, Karsten and Pedersen, Oluf and Parkhill, Julian and Weissenbach, Jean and Bork, Peer and Ehrlich, S Dusko and Wang, Jun},
date = {2010-03},
pages = {59--65},
keywords = {Adult,Bacteria,Bacteria: classification,Bacteria: genetics,Bacteria: isolation & purification,Bacterial,Bacterial: genetics,Bacteria: metabolism,Cohort Studies,Contig Mapping,Denmark,DNA,Essential,Essential: genetics,Feces,Feces: microbiology,Gastrointestinal Tract,Gastrointestinal Tract: microbiology,Genes,Genome,Genomics,Health,Humans,Inflammatory Bowel Diseases,Inflammatory Bowel Diseases: genetics,Metagenome,Metagenome: genetics,Obesity,Obesity: genetics,Open Reading Frames,Open Reading Frames: genetics,Overweight,Overweight: genetics,Sequence Analysis,Spain},
eprinttype = {pmid},
eprint = {20203603}
}
% Quince, Curtis and Sloan (2008), The ISME Journal 2(10): estimating microbial diversity and the sequencing effort needed to sample it.
% Title carries no terminal period; the journal name is stored in its full title-case form.
@article{QuinceRational2008,
title = {The Rational Exploration of Microbial Diversity},
volume = {2},
issn = {1751-7370},
doi = {10.1038/ismej.2008.69},
abstract = {The exploration of the microbial world has been an exciting series of unanticipated discoveries despite being largely uninformed by rational estimates of the magnitude of task confronting us. However, in the long term, more structured surveys can be achieved by estimating the diversity of microbial communities and the effort required to describe them. The rates of recovery of new microbial taxa in very large samples suggest that many more taxa remain to be discovered in soils and the oceans. We apply a robust statistical method to large gene sequence libraries from these environments to estimate both diversity and the sequencing effort required to obtain a given fraction of that diversity. In the upper ocean, we predict some 1400 phylotypes, and a mere fivefold increase in shotgun reads could yield 90\% of the metagenome, that is, all genes from all taxa. However, at deep ocean, hydrothermal vents and diversities in soils can be up to two orders of magnitude larger, and hundreds of times the current number of samples will be required just to obtain 90\% of the taxonomic diversity based on 3\% difference in 16S rDNA. Obtaining 90\% of the metagenome will require tens of thousands of times the current sequencing effort. Although the definitive sequencing of hyperdiverse environments is not yet possible, we can, using taxa-abundance distributions, begin to plan and develop the required methods and strategies. This would initiate a new phase in the exploration of the microbial world.},
timestamp = {2016-06-16T16:08:17Z},
number = {10},
journaltitle = {The ISME Journal},
author = {Quince, Christopher and Curtis, Thomas P and Sloan, William T},
date = {2008-10},
pages = {997--1006},
keywords = {16S,16S: genetics,Bacteria,Bacteria: genetics,Bacteria: isolation & purification,Bacterial,Bacterial: genetics,Biodiversity,DNA,Genome,Models,Ribosomal,Ribosomal: genetics,RNA,Seawater,Seawater: microbiology,Sequence Analysis,Soil Microbiology,Statistical},
eprinttype = {pmid},
eprint = {18650928}
}
@article{QuinceAccurate2009,
title = {Accurate Determination of Microbial Diversity from 454 Pyrosequencing Data.},
volume = {6},
issn = {1548-7105},
doi = {10.1038/nmeth.1361},
abstract = {We present an algorithm, PyroNoise, that clusters the flowgrams of 454 pyrosequencing reads using a distance measure that models sequencing noise. This infers the true sequences in a collection of amplicons. We pyrosequenced a known mixture of microbial 16S rDNA sequences extracted from a lake and found that without noise reduction the number of operational taxonomic units is overestimated but using PyroNoise it can be accurately calculated.},
timestamp = {2016-06-16T16:08:17Z},
number = {9},
journaltitle = {Nature methods},
author = {Quince, Christopher and Lanzén, Anders and Curtis, Thomas P and Davenport, Russell J and Hall, Neil and Head, Ian M and Read, L Fiona and Sloan, William T},
date = {2009-09},