[med-svn] [r-bioc-phyloseq] 09/11: New upstream version 1.19.1
Andreas Tille
tille at debian.org
Fri Sep 22 20:52:52 UTC 2017
This is an automated email from the git hooks/post-receive script.
tille pushed a commit to branch master
in repository r-bioc-phyloseq.
commit bab08608dc7b5e56e42d1d32c8e3908d6dc6fdb8
Author: Andreas Tille <tille at debian.org>
Date: Fri Sep 22 22:47:44 2017 +0200
New upstream version 1.19.1
---
DESCRIPTION | 41 +
NAMESPACE | 272 ++
R/IO-methods.R | 2431 +++++++++++++++
R/allClasses.R | 295 ++
R/allData.R | 213 ++
R/allPackage.R | 24 +
R/almostAllAccessors.R | 554 ++++
R/as-methods.R | 34 +
R/assignment-methods.R | 375 +++
R/deprecated_functions.R | 130 +
R/distance-methods.R | 690 +++++
R/extend_DESeq2.R | 72 +
R/extend_metagenomeSeq.R | 68 +
R/extend_vegan.R | 274 ++
R/extract-methods.R | 74 +
R/merge-methods.R | 600 ++++
R/multtest-wrapper.R | 173 ++
R/network-methods.R | 171 ++
R/ordination-methods.R | 642 ++++
R/otuTable-class.R | 145 +
R/phylo-class.R | 26 +
R/phyloseq-class.R | 400 +++
R/plot-methods.R | 2857 ++++++++++++++++++
R/sampleData-class.R | 149 +
R/show-methods.R | 82 +
R/taxonomyTable-class.R | 132 +
R/transform_filter-methods.R | 1187 ++++++++
R/validity-methods.R | 117 +
README.html | 102 +
README.md | 54 +
TODO.txt | 5 +
build/vignette.rds | Bin 0 -> 349 bytes
data/GlobalPatterns.RData | Bin 0 -> 435652 bytes
data/datalist | 4 +
data/enterotype.RData | Bin 0 -> 195260 bytes
data/esophagus.RData | Bin 0 -> 1840 bytes
data/soilrep.RData | Bin 0 -> 107392 bytes
debian/README.test | 8 -
debian/changelog | 34 -
debian/compat | 1 -
debian/control | 34 -
debian/copyright | 675 -----
debian/docs | 3 -
debian/patches/fix_r-cran-ade4_versioning.patch | 21 -
debian/patches/series | 1 -
debian/rules | 4 -
debian/source/format | 1 -
debian/tests/control | 3 -
debian/tests/run-unit-test | 13 -
debian/watch | 3 -
inst/CITATION | 13 +
inst/NEWS | 1659 +++++++++++
inst/doc/Unweighted_UniFrac.RData | Bin 0 -> 2785 bytes
inst/doc/phyloseq-FAQ.R | 31 +
inst/doc/phyloseq-FAQ.Rmd | 562 ++++
inst/doc/phyloseq-FAQ.html | 347 +++
inst/doc/phyloseq-analysis.R | 208 ++
inst/doc/phyloseq-analysis.Rmd | 506 ++++
inst/doc/phyloseq-analysis.html | 463 +++
inst/doc/phyloseq-basics.R | 112 +
inst/doc/phyloseq-basics.Rmd | 600 ++++
inst/doc/phyloseq-basics.html | 509 ++++
inst/doc/phyloseq-mixture-models.R | 74 +
inst/doc/phyloseq-mixture-models.Rmd | 191 ++
inst/doc/phyloseq-mixture-models.html | 302 ++
inst/extdata/GP_otu_table_rand_short.txt.gz | Bin 0 -> 13316 bytes
inst/extdata/GP_tree_rand_short.newick.gz | Bin 0 -> 7183 bytes
inst/extdata/biom-refseq.fasta | 29 +
inst/extdata/biom-tree.phy | 1 +
inst/extdata/esophagus.fn.list.gz | Bin 0 -> 4578 bytes
inst/extdata/esophagus.fn.shared.gz | Bin 0 -> 2362 bytes
inst/extdata/esophagus.good.groups.gz | Bin 0 -> 1347 bytes
inst/extdata/esophagus.tree.gz | Bin 0 -> 4989 bytes
inst/extdata/gg13-5-73.tree.gz | Bin 0 -> 3512 bytes
inst/extdata/gp500-pycogent.py | 19 +
inst/extdata/gp500-uuf.csv | 28 +
inst/extdata/gp500-wuf.csv | 28 +
inst/extdata/gp500-wufu.csv | 28 +
inst/extdata/gp500test.env.txt | 3093 ++++++++++++++++++++
inst/extdata/gp500test.tree | 1 +
inst/extdata/master_map.txt | 29 +
inst/extdata/min_dense_otu_table.biom | 32 +
inst/extdata/min_sparse_otu_table.biom | 43 +
inst/extdata/mothur_example.cons.taxonomy.gz | Bin 0 -> 565 bytes
inst/extdata/qiime500-refseq.fasta | 3070 +++++++++++++++++++
inst/extdata/rformat_dist_0.03.txt.gz | Bin 0 -> 26684 bytes
inst/extdata/rich_dense_otu_table.biom | 56 +
inst/extdata/rich_sparse_otu_table.biom | 66 +
.../study_1457_split_library_seqs_and_mapping.zip | Bin 0 -> 161176 bytes
...study_816_split_library_seqs_and_mapping.tar.gz | Bin 0 -> 4991 bytes
.../study_816_split_library_seqs_and_mapping.zip | Bin 0 -> 5191 bytes
inst/extdata/study_gp.txt | 29 +
inst/extdata/usearch.uc | 100 +
inst/scripts/installer.R | 49 +
man/DPCoA.Rd | 91 +
man/JSD.Rd | 60 +
man/UniFrac-methods.Rd | 174 ++
man/access.Rd | 49 +
man/assign-otu_table.Rd | 46 +
man/assign-phy_tree.Rd | 35 +
man/assign-sample_data.Rd | 48 +
man/assign-sample_names.Rd | 51 +
man/assign-tax_table.Rd | 53 +
man/assign-taxa_are_rows.Rd | 33 +
man/assign-taxa_names.Rd | 57 +
man/build_tax_table.Rd | 43 +
man/capscale-phyloseq-methods.Rd | 70 +
man/cca-rda-phyloseq-methods.Rd | 60 +
man/chunkReOrder.Rd | 35 +
man/data-GlobalPatterns.Rd | 50 +
man/data-enterotype.Rd | 48 +
man/data-esophagus.Rd | 49 +
man/data-soilrep.Rd | 84 +
man/decorana.Rd | 27 +
man/dist-class.Rd | 13 +
man/distance.Rd | 110 +
man/distanceMethodList.Rd | 76 +
man/envHash2otu_table.Rd | 47 +
man/estimate_richness.Rd | 64 +
man/export_env_file.Rd | 36 +
man/export_mothur_dist.Rd | 40 +
man/extract-methods.Rd | 80 +
man/filter_taxa.Rd | 55 +
man/filterfun_sample.Rd | 37 +
man/fix_phylo.Rd | 18 +
man/gapstat_ord.Rd | 70 +
man/genefilter_sample-methods.Rd | 73 +
man/get.component.classes.Rd | 22 +
man/get_sample-methods.Rd | 40 +
man/get_taxa-methods.Rd | 41 +
man/get_taxa_unique.Rd | 39 +
man/get_variable.Rd | 35 +
man/getslots.phyloseq.Rd | 34 +
man/import.Rd | 73 +
man/import_RDP_cluster.Rd | 49 +
man/import_RDP_otu.Rd | 44 +
man/import_biom.Rd | 171 ++
man/import_env_file.Rd | 42 +
man/import_mothur.Rd | 121 +
man/import_mothur_constaxonomy.Rd | 36 +
man/import_mothur_dist.Rd | 32 +
man/import_mothur_groups.Rd | 32 +
man/import_mothur_otu_table.Rd | 41 +
man/import_mothur_otulist.Rd | 43 +
man/import_mothur_shared.Rd | 24 +
man/import_pyrotagger_tab.Rd | 67 +
man/import_qiime.Rd | 156 +
man/import_qiime_otu_tax.Rd | 83 +
man/import_qiime_sample_data.Rd | 49 +
man/import_uparse.Rd | 69 +
man/import_usearch_uc.Rd | 89 +
man/index_reorder.Rd | 30 +
man/intersect_taxa.Rd | 33 +
man/make_network.Rd | 97 +
man/merge_phyloseq.Rd | 67 +
man/merge_phyloseq_pair-methods.Rd | 79 +
man/merge_samples-methods.Rd | 73 +
man/merge_taxa-methods.Rd | 78 +
man/metaMDS.Rd | 27 +
man/microbio_me_qiime.Rd | 99 +
man/mt-methods.Rd | 93 +
man/nodeplotblank.Rd | 40 +
man/nodeplotboot.Rd | 61 +
man/nodeplotdefault.Rd | 48 +
man/nsamples-methods.Rd | 45 +
man/ntaxa-methods.Rd | 48 +
man/ordinate.Rd | 156 +
man/otu_table-class.Rd | 24 +
man/otu_table-methods.Rd | 59 +
man/parseTaxonomy-functions.Rd | 71 +
man/pcoa.Rd | 20 +
man/phy_tree-methods.Rd | 58 +
man/phylo-class.Rd | 14 +
man/phylo.Rd | 36 +
man/phyloseq-class.Rd | 51 +
man/phyloseq-deprecated.Rd | 111 +
man/phyloseq-package.Rd | 28 +
man/phyloseq.Rd | 49 +
man/phyloseq_to_deseq2.Rd | 62 +
man/phyloseq_to_metagenomeSeq.Rd | 42 +
man/plot_bar.Rd | 74 +
man/plot_clusgap.Rd | 59 +
man/plot_heatmap.Rd | 206 ++
man/plot_net.Rd | 117 +
man/plot_network.Rd | 112 +
man/plot_ordination.Rd | 114 +
man/plot_phyloseq-methods.Rd | 48 +
man/plot_richness.Rd | 134 +
man/plot_scree.Rd | 55 +
man/plot_tree.Rd | 185 ++
man/prune_samples-methods.Rd | 50 +
man/prune_taxa-methods.Rd | 71 +
man/psmelt.Rd | 73 +
man/rank_names.Rd | 33 +
man/rarefy_even_depth.Rd | 126 +
man/read_tree.Rd | 51 +
man/read_tree_greengenes.Rd | 57 +
man/reconcile_categories.Rd | 35 +
man/refseq-methods.Rd | 49 +
man/rm_outlierf.Rd | 40 +
man/sample_data-class.Rd | 25 +
man/sample_data-methods.Rd | 53 +
man/sample_names-methods.Rd | 39 +
man/sample_sums.Rd | 31 +
man/sample_variables.Rd | 33 +
man/show-methods.Rd | 34 +
man/show_mothur_cutoffs.Rd | 29 +
man/splat.phyloseq.objects.Rd | 36 +
man/subset_ord_plot.Rd | 70 +
man/subset_samples-methods.Rd | 43 +
man/subset_taxa-methods.Rd | 41 +
man/tax_glom.Rd | 73 +
man/tax_table-methods.Rd | 57 +
man/taxa_are_rows-methods.Rd | 31 +
man/taxa_names-methods.Rd | 55 +
man/taxa_sums.Rd | 30 +
man/taxonomyTable-class.Rd | 16 +
man/threshrank.Rd | 43 +
man/threshrankfun.Rd | 38 +
man/tip_glom.Rd | 70 +
man/topf.Rd | 42 +
man/topk.Rd | 36 +
man/topp.Rd | 38 +
man/transformcounts.Rd | 55 +
man/transpose-methods.Rd | 32 +
man/tree_layout.Rd | 67 +
tests/testthat-phyloseq.R | 13 +
tests/testthat/test-IO.R | 379 +++
tests/testthat/test-distance.R | 118 +
tests/testthat/test-merge.R | 305 ++
tests/testthat/test-phyloseq.R | 140 +
tests/testthat/test-plot.R | 500 ++++
tests/testthat/test-rarefy.R | 61 +
tests/testthat/test-subset.R | 110 +
tests/testthat/test-transform.R | 100 +
vignettes/import_qiime_directory_structure.jpg | Bin 0 -> 24538 bytes
vignettes/phyloseq-FAQ.Rmd | 562 ++++
vignettes/phyloseq-analysis.Rmd | 506 ++++
vignettes/phyloseq-basics.Rmd | 600 ++++
vignettes/phyloseq-mixture-models.Rmd | 191 ++
vignettes/phyloseq_classes_7.png | Bin 0 -> 294387 bytes
241 files changed, 36075 insertions(+), 801 deletions(-)
diff --git a/DESCRIPTION b/DESCRIPTION
new file mode 100644
index 0000000..f0289b8
--- /dev/null
+++ b/DESCRIPTION
@@ -0,0 +1,41 @@
+Package: phyloseq
+Version: 1.19.1
+Date: 2016-12-29
+Title: Handling and analysis of high-throughput microbiome census data
+Description: phyloseq provides a set of classes and tools
+ to facilitate the import, storage, analysis, and
+ graphical display of microbiome census data.
+Maintainer: Paul J. McMurdie <joey711 at gmail.com>
+Author: Paul J. McMurdie <joey711 at gmail.com>,
+ Susan Holmes <susan at stat.stanford.edu>, with
+ contributions from Gregory Jordan and Scott Chamberlain
+License: AGPL-3
+Imports: BiocGenerics (>= 0.18.0), ade4 (>= 1.7.4), ape (>= 3.4),
+ biomformat (>= 1.0.0), Biostrings (>= 2.40.0), cluster (>=
+ 2.0.4), data.table (>= 1.9.6), foreach (>= 1.4.3), ggplot2 (>=
+ 2.1.0), igraph (>= 1.0.1), methods (>= 3.3.0), multtest (>=
+ 2.28.0), plyr (>= 1.8.3), reshape2 (>= 1.4.1), scales (>=
+ 0.4.0), vegan (>= 2.3.5), Biobase
+Depends: R (>= 3.3.0)
+Suggests: BiocStyle (>= 2.0.0), DESeq2 (>= 1.12.0), genefilter (>=
+ 1.54), testthat (>= 1.0.2), knitr (>= 1.13), metagenomeSeq (>=
+ 1.14), rmarkdown (>= 0.9.6)
+VignetteBuilder: knitr
+Enhances: doParallel (>= 1.0.10)
+biocViews: Sequencing, Microbiome, Metagenomics, Clustering,
+ Classification, MultipleComparison, GeneticVariability
+URL: http://dx.plos.org/10.1371/journal.pone.0061217
+BugReports: https://github.com/joey711/phyloseq/issues
+Collate: 'allClasses.R' 'allPackage.R' 'allData.R' 'as-methods.R'
+ 'show-methods.R' 'plot-methods.R' 'extract-methods.R'
+ 'almostAllAccessors.R' 'otuTable-class.R' 'phyloseq-class.R'
+ 'taxonomyTable-class.R' 'IO-methods.R' 'merge-methods.R'
+ 'multtest-wrapper.R' 'ordination-methods.R'
+ 'transform_filter-methods.R' 'validity-methods.R'
+ 'assignment-methods.R' 'sampleData-class.R' 'extend_vegan.R'
+ 'network-methods.R' 'distance-methods.R'
+ 'deprecated_functions.R' 'extend_DESeq2.R' 'phylo-class.R'
+ 'extend_metagenomeSeq.R'
+RoxygenNote: 5.0.1
+NeedsCompilation: no
+Packaged: 2016-12-30 00:05:28 UTC; biocbuild
diff --git a/NAMESPACE b/NAMESPACE
new file mode 100644
index 0000000..713e340
--- /dev/null
+++ b/NAMESPACE
@@ -0,0 +1,272 @@
+# Generated by roxygen2: do not edit by hand
+
+export("otuTable<-")
+export("otu_table<-")
+export("phy_tree<-")
+export("sam_data<-")
+export("sampleData<-")
+export("sample_data<-")
+export("sample_names<-")
+export("speciesAreRows<-")
+export("taxTab<-")
+export("tax_table<-")
+export("taxa_are_rows<-")
+export("taxa_names<-")
+export("tre<-")
+export(DPCoA)
+export(UniFrac)
+export(access)
+export(build_tax_table)
+export(distance)
+export(distanceMethodList)
+export(estimate_richness)
+export(export_env_file)
+export(export_mothur_dist)
+export(filter_taxa)
+export(filterfunSample)
+export(filterfun_sample)
+export(gapstat_ord)
+export(genefilterSample)
+export(genefilter_sample)
+export(getSamples)
+export(getSpecies)
+export(getTaxa)
+export(getVariable)
+export(get_sample)
+export(get_taxa)
+export(get_taxa_unique)
+export(get_variable)
+export(getslots.phyloseq)
+export(import)
+export(import_RDP_cluster)
+export(import_RDP_otu)
+export(import_biom)
+export(import_env_file)
+export(import_mothur)
+export(import_mothur_dist)
+export(import_pyrotagger_tab)
+export(import_qiime)
+export(import_qiime_otu_tax)
+export(import_qiime_sampleData)
+export(import_qiime_sample_data)
+export(import_uparse)
+export(import_usearch_uc)
+export(make_network)
+export(merge_phyloseq)
+export(merge_phyloseq_pair)
+export(merge_samples)
+export(merge_species)
+export(merge_taxa)
+export(microbio_me_qiime)
+export(mt)
+export(nodeplotblank)
+export(nodeplotboot)
+export(nodeplotdefault)
+export(nsamples)
+export(nspecies)
+export(ntaxa)
+export(ordinate)
+export(otuTable)
+export(otu_table)
+export(parse_taxonomy_default)
+export(parse_taxonomy_greengenes)
+export(parse_taxonomy_qiime)
+export(phy_tree)
+export(phyloseq)
+export(phyloseq_to_deseq2)
+export(phyloseq_to_metagenomeSeq)
+export(plot_bar)
+export(plot_clusgap)
+export(plot_heatmap)
+export(plot_net)
+export(plot_network)
+export(plot_ordination)
+export(plot_phyloseq)
+export(plot_richness)
+export(plot_richness_estimates)
+export(plot_scree)
+export(plot_taxa_bar)
+export(plot_tree)
+export(prune_samples)
+export(prune_species)
+export(prune_taxa)
+export(psmelt)
+export(rank.names)
+export(rank_names)
+export(rarefy_even_depth)
+export(read_tree)
+export(read_tree_greengenes)
+export(refseq)
+export(rm_outlierf)
+export(samData)
+export(sam_data)
+export(sample.names)
+export(sample.variables)
+export(sampleData)
+export(sampleNames)
+export(sampleSums)
+export(sample_data)
+export(sample_names)
+export(sample_sums)
+export(sample_variables)
+export(show_mothur_cutoffs)
+export(show_mothur_list_cutoffs)
+export(species.names)
+export(speciesAreRows)
+export(speciesSums)
+export(speciesarerows)
+export(subset_ord_plot)
+export(subset_samples)
+export(subset_species)
+export(subset_taxa)
+export(t)
+export(taxTab)
+export(tax_glom)
+export(tax_table)
+export(taxa_are_rows)
+export(taxa_names)
+export(taxa_sums)
+export(taxaplot)
+export(taxglom)
+export(taxtab)
+export(threshrank)
+export(threshrankfun)
+export(tip_glom)
+export(tipglom)
+export(topf)
+export(topk)
+export(topp)
+export(transformSampleCounts)
+export(transform_sample_counts)
+export(tre)
+export(tree_layout)
+exportClasses(dist)
+exportClasses(otu_table)
+exportClasses(phylo)
+exportClasses(phyloseq)
+exportClasses(sample_data)
+exportClasses(taxonomyTable)
+exportMethods("[")
+exportMethods(show)
+import(BiocGenerics)
+import(foreach)
+import(methods)
+import(reshape2)
+importClassesFrom(Biostrings,AAStringSet)
+importClassesFrom(Biostrings,BStringSet)
+importClassesFrom(Biostrings,DNAStringSet)
+importClassesFrom(Biostrings,IlluminaQuality)
+importClassesFrom(Biostrings,PhredQuality)
+importClassesFrom(Biostrings,QualityScaledAAStringSet)
+importClassesFrom(Biostrings,QualityScaledBStringSet)
+importClassesFrom(Biostrings,QualityScaledDNAStringSet)
+importClassesFrom(Biostrings,QualityScaledRNAStringSet)
+importClassesFrom(Biostrings,QualityScaledXStringSet)
+importClassesFrom(Biostrings,RNAStringSet)
+importClassesFrom(Biostrings,SolexaQuality)
+importClassesFrom(Biostrings,XStringQuality)
+importClassesFrom(Biostrings,XStringSet)
+importFrom(Biobase,AnnotatedDataFrame)
+importFrom(Biostrings,readDNAStringSet)
+importFrom(ade4,cailliez)
+importFrom(ade4,dpcoa)
+importFrom(ade4,is.euclid)
+importFrom(ape,consensus)
+importFrom(ape,cophenetic.phylo)
+importFrom(ape,drop.tip)
+importFrom(ape,is.rooted)
+importFrom(ape,ladderize)
+importFrom(ape,node.depth)
+importFrom(ape,pcoa)
+importFrom(ape,prop.part)
+importFrom(ape,read.nexus)
+importFrom(ape,read.tree)
+importFrom(ape,reorder.phylo)
+importFrom(ape,root)
+importFrom(ape,write.nexus)
+importFrom(biomformat,biom_data)
+importFrom(biomformat,observation_metadata)
+importFrom(biomformat,read_biom)
+importFrom(biomformat,sample_metadata)
+importFrom(cluster,agnes)
+importFrom(cluster,clusGap)
+importFrom(cluster,pam)
+importFrom(data.table,copy)
+importFrom(data.table,data.table)
+importFrom(data.table,fread)
+importFrom(data.table,setkey)
+importFrom(data.table,setkeyv)
+importFrom(data.table,setnames)
+importFrom(ggplot2,aes)
+importFrom(ggplot2,aes_string)
+importFrom(ggplot2,element_blank)
+importFrom(ggplot2,element_text)
+importFrom(ggplot2,facet_grid)
+importFrom(ggplot2,facet_wrap)
+importFrom(ggplot2,geom_bar)
+importFrom(ggplot2,geom_errorbar)
+importFrom(ggplot2,geom_line)
+importFrom(ggplot2,geom_path)
+importFrom(ggplot2,geom_point)
+importFrom(ggplot2,geom_raster)
+importFrom(ggplot2,geom_segment)
+importFrom(ggplot2,geom_text)
+importFrom(ggplot2,ggplot)
+importFrom(ggplot2,ggtitle)
+importFrom(ggplot2,scale_alpha)
+importFrom(ggplot2,scale_fill_gradient)
+importFrom(ggplot2,scale_size)
+importFrom(ggplot2,scale_size_continuous)
+importFrom(ggplot2,scale_size_manual)
+importFrom(ggplot2,scale_x_continuous)
+importFrom(ggplot2,scale_x_discrete)
+importFrom(ggplot2,scale_y_discrete)
+importFrom(ggplot2,theme)
+importFrom(ggplot2,theme_bw)
+importFrom(ggplot2,update_labels)
+importFrom(ggplot2,xlab)
+importFrom(ggplot2,ylab)
+importFrom(igraph,V)
+importFrom(igraph,degree)
+importFrom(igraph,delete.vertices)
+importFrom(igraph,get.edgelist)
+importFrom(igraph,get.vertex.attribute)
+importFrom(igraph,graph.adjacency)
+importFrom(igraph,graph.data.frame)
+importFrom(igraph,layout.auto)
+importFrom(igraph,layout.circle)
+importFrom(igraph,layout.fruchterman.reingold)
+importFrom(igraph,layout.fruchterman.reingold.grid)
+importFrom(igraph,layout.graphopt)
+importFrom(igraph,layout.kamada.kawai)
+importFrom(igraph,layout.lgl)
+importFrom(igraph,layout.random)
+importFrom(igraph,layout.reingold.tilford)
+importFrom(igraph,layout.sphere)
+importFrom(igraph,layout.spring)
+importFrom(igraph,layout.svd)
+importFrom(igraph,vcount)
+importFrom(multtest,mt.maxT)
+importFrom(multtest,mt.minP)
+importFrom(plyr,ddply)
+importFrom(plyr,is.discrete)
+importFrom(plyr,ldply)
+importFrom(plyr,llply)
+importFrom(scales,log_trans)
+importFrom(stats,p.adjust)
+importFrom(stats,p.adjust.methods)
+importFrom(vegan,betadiver)
+importFrom(vegan,capscale)
+importFrom(vegan,cca)
+importFrom(vegan,decorana)
+importFrom(vegan,decostand)
+importFrom(vegan,designdist)
+importFrom(vegan,diversity)
+importFrom(vegan,estimateR)
+importFrom(vegan,fisher.alpha)
+importFrom(vegan,metaMDS)
+importFrom(vegan,rda)
+importFrom(vegan,scores)
+importFrom(vegan,vegdist)
+importFrom(vegan,wascores)
+importFrom(vegan,wisconsin)
diff --git a/R/IO-methods.R b/R/IO-methods.R
new file mode 100644
index 0000000..2b578c8
--- /dev/null
+++ b/R/IO-methods.R
@@ -0,0 +1,2431 @@
+################################################################################
+#' Universal import method (wrapper) for phyloseq-package
+#'
+#' A user must still understand the additional arguments required for each
+#' type of import data. Those arguments are described in detail at the
+#' tool-specific \code{import_*} links below. Each clustering tool / package / pipeline
+#' has its own idiosyncratic set of file names / types, and it remains the
+#' responsibility of the user to understand which file-path should be provided
+#' to each argument for the particular importing submethod. This method
+#' merely provides a central documentation and method-name, and the arguments
+#' are passed along as-is.
+#'
+#' @usage import(pipelineName, ...)
+#'
+#' @param pipelineName (Required). Character string. The name of the
+#' analysis tool / pipeline / package
+#' that created the OTU-cluster data or other data that you now want to import.
+#' Current options are \code{c("BIOM", "mothur", "pyrotagger", "QIIME", "RDP")}, and
+#' only the first letter is necessary.
+#'
+#' @param ... (Required). Additional named arguments providing file paths, and possible
+#' other parameters to the desired tool-specific import function.
+#'
+#' @return In most cases a \code{\link{phyloseq-class}} will be returned, though
+#' the included component data will vary by pipeline/tool, and also
+#' by the types of data files provided.
+#' The expected behavior is to return the most-comprehensive object possible,
+#' given the provided arguments and pipeline/tool.
+#'
+#' @seealso
+#'
+#' For BIOM format, see:
+#' \code{\link{import_biom}}
+#'
+#' For mothur, see:
+#' \code{\link{import_mothur}}
+#'
+#' Separate tools for mothur are also:
+#' \code{\link{show_mothur_cutoffs}}
+#' \code{\link{import_mothur_dist}}
+#' \code{\link{export_mothur_dist}}
+#'
+#' For PyroTagger, see:
+#' \code{\link{import_pyrotagger_tab}}
+#'
+#' For QIIME legacy format, see:
+#' \code{\link{import_qiime}}
+#'
+#' For RDP pipeline, see:
+#' \code{\link{import_RDP_cluster}}
+#'
+#' \code{\link{import_RDP_otu}}
+#'
+#' @references
+#'
+#' BIOM: \url{http://www.biom-format.org/}
+#'
+#' mothur: \url{http://www.mothur.org/wiki/Main_Page}
+#'
+#' PyroTagger: \url{http://pyrotagger.jgi-psf.org/}
+#'
+#' QIIME: \url{http://qiime.org/}
+#'
+#' RDP pipeline: \url{http://pyro.cme.msu.edu/index.jsp}
+#'
+#' @export
+#' @examples
+#' ## See documentation of a specific import function
+import <- function(pipelineName, ...){
+  # Every supported pipeline name begins with a different letter, so the
+  # first character alone is enough to choose the importer.
+  firstLetter <- substr(pipelineName, 1, 1)
+
+  # Accept either case of each known initial; anything else is an error.
+  knownInitials <- c("B", "b", "M", "m", "P", "p", "Q", "q", "R", "r")
+  if( !(firstLetter %in% knownInitials) ){
+    stop("You need to select among available importer types:\n",
+      "\"BIOM\", \"mothur\", \"pyrotagger\", \"QIIME\", \"RDP\" \n See ?import for details")
+  }
+
+  # Hand the remaining arguments, untouched, to the matching importer.
+  # The initial was validated above, so one of these arms always matches.
+  switch(toupper(firstLetter),
+    B = import_biom(...),
+    M = import_mothur(...),
+    P = import_pyrotagger_tab(...),
+    Q = import_qiime(...),
+    R = import_RDP_cluster(...)
+  )
+}
+################################################################################
+#' Import function to read the now legacy-format QIIME OTU table.
+#'
+#' QIIME produces several files that can be directly imported by
+#' the \code{\link{phyloseq-package}}.
+#' Originally, QIIME produced its own custom format table
+#' that contained both OTU-abundance
+#' and taxonomic identity information.
+#' This function is still included in phyloseq mainly to accommodate these
+#' now-outdated files. Recent versions of QIIME store output in the
+#' biom-format, an emerging file format standard for microbiome data.
+#' If your data is in the biom-format, for example if it ends with a \code{.biom}
+#' file name extension, then you should use the \code{\link{import_biom}}
+#' function instead.
+#'
+#' Other related files include
+#' the mapping-file that typically stores sample covariates,
+#' converted naturally to the
+#' \code{\link{sample_data-class}} component data type in the phyloseq-package.
+#' QIIME may also produce a
+#' phylogenetic tree with a tip for each OTU, which can also be
+#' specified here or imported separately using \code{\link{read_tree}}.
+#'
+#' See \url{http://www.qiime.org/} for details on using QIIME. While there are
+#' many complex dependencies, QIIME can be downloaded as a pre-installed
+#' linux virtual machine that runs ``off the shelf''.
+#'
+#' The different files useful for import to \emph{phyloseq} are not collocated in
+#' a typical run of the QIIME pipeline. See the main \emph{phyloseq} vignette for an
+#' example of where to find the relevant files in the output directory.
+#'
+#' @param otufilename (Optional). A character string indicating
+#' the file location of the OTU file.
+#' The combined OTU abundance and taxonomic identification file,
+#' tab-delimited, as produced by QIIME under default output settings.
+#' Default value is \code{NULL}.
+#'
+#' @param mapfilename (Optional). The QIIME map file is required
+#' for processing barcoded primers in QIIME
+#' as well as some of the post-clustering analysis. This is a required
+#' input file for running QIIME. Its strict formatting specification should be
+#' followed for correct parsing by this function.
+#' Default value is \code{NULL}.
+#'
+#' @param treefilename (Optional). Default value is \code{NULL}.
+#' A file representing a phylogenetic tree
+#' or a \code{\link{phylo}} object.
+#' Files can be NEXUS or Newick format.
+#' See \code{\link{read_tree}} for more details.
+#' Also, if using a recent release of the GreenGenes database tree,
+#' try the \code{\link{read_tree_greengenes}} function --
+#' this should solve some issues specific to importing that tree.
+#' If provided, the tree should have the same OTUs/tip-labels
+#' as the OTUs in the other files.
+#' Any taxa or samples missing in one of the files are removed from all.
+#' As an example from the QIIME pipeline,
+#' this tree would be a tree of the representative 16S rRNA sequences from each OTU
+#' cluster, with the number of leaves/tips equal to the number of taxa/species/OTUs,
+#' or the complete reference database tree that contains the OTU identifiers
+#' of every OTU in your abundance table.
+#' Note that this argument can be a tree object (\code{\link[ape]{phylo}}-class)
+#' for cases where the tree has been --- or needs to be --- imported separately,
+#' as in the case of the GreenGenes tree mentioned earlier (\code{\link{read_tree_greengenes}}).
+#'
+#' @param refseqfilename (Optional). Default \code{NULL}.
+#' The file path of the biological sequence file that contains at a minimum
+#' a sequence for each OTU in the dataset.
+#' Alternatively, you may provide an already-imported
+#' \code{\link[Biostrings]{XStringSet}} object that satisfies this condition.
+#' In either case, the \code{\link{names}} of each OTU need to match exactly the
+#' \code{\link{taxa_names}} of the other components of your data.
+#' If this is not the case, for example if the data file is a FASTA format but
+#' contains additional information after the OTU name in each sequence header,
+#' then some additional parsing is necessary,
+#' which you can either perform separately before calling this function,
+#' or describe explicitly in a custom function provided in the (next) argument,
+#' \code{refseqFunction}.
+#' Note that the \code{\link[Biostrings]{XStringSet}} class can represent any
+#' arbitrary sequence, including user-defined subclasses, but is most-often
+#' used to represent RNA, DNA, or amino acid sequences.
+#' The only constraint is that this special list of sequences
+#' has exactly one named element for each OTU in the dataset.
+#'
+#' @param refseqFunction (Optional).
+#' Default is \code{\link[Biostrings]{readDNAStringSet}},
+#' which expects to read a fasta-formatted DNA sequence file.
+#' If your reference sequences for each OTU are amino acid, RNA, or something else,
+#' then you will need to specify a different function here.
+#' This is the function used to read the file connection provided as the
+#' previous argument, \code{refseqfilename}.
+#' This argument is ignored if \code{refseqfilename} is already a
+#' \code{\link[Biostrings]{XStringSet}} class.
+#'
+#' @param refseqArgs (Optional).
+#' Default \code{NULL}.
+#' Additional arguments to \code{refseqFunction}.
+#' See \code{\link[Biostrings]{XStringSet-io}} for details about
+#' additional arguments to the standard read functions in the Biostrings package.
+#'
+#' @param parseFunction (Optional). An optional custom function for parsing the
+#' character string that contains the taxonomic assignment of each OTU.
+#' The default parsing function is \code{\link{parse_taxonomy_qiime}},
+#' specialized for splitting the \code{";"}-delimited strings and also
+#' attempting to interpret greengenes prefixes, if any, as that is a common
+#' format of the taxonomy string produced by QIIME.
+#'
+#' @param verbose (Optional). A \code{\link{logical}}.
+#' Default is \code{TRUE}.
+#' Should progress messages
+#' be \code{\link{cat}}ted to standard out?
+#'
+#' @param ... Additional arguments passed to \code{\link{read_tree}}
+#'
+#' @return A \code{\link{phyloseq-class}} object.
+#'
+#' @seealso
+#'
+#' \code{\link{phyloseq}}
+#'
+#' \code{\link{merge_phyloseq}}
+#'
+#' \code{\link{read_tree}}
+#'
+#' \code{\link{read_tree_greengenes}}
+#'
+#' \code{\link[Biostrings]{XStringSet-io}}
+#'
+#' @references \url{http://qiime.org/}
+#'
+#' ``QIIME allows analysis of high-throughput community sequencing data.''
+#' J Gregory Caporaso, Justin Kuczynski, Jesse Stombaugh, Kyle Bittinger, Frederic D Bushman,
+#' Elizabeth K Costello, Noah Fierer, Antonio Gonzalez Pena, Julia K Goodrich, Jeffrey I Gordon,
+#' Gavin A Huttley, Scott T Kelley, Dan Knights, Jeremy E Koenig, Ruth E Ley,
+#' Catherine A Lozupone, Daniel McDonald, Brian D Muegge, Meg Pirrung, Jens Reeder, Joel R Sevinsky,
+#' Peter J Turnbaugh, William A Walters, Jeremy Widmann, Tanya Yatsunenko, Jesse Zaneveld and Rob Knight;
+#' Nature Methods, 2010; doi:10.1038/nmeth.f.303
+#'
+#' @importClassesFrom Biostrings XStringSet
+#' @importFrom Biostrings readDNAStringSet
+#' @export
+#' @examples
+#' otufile <- system.file("extdata", "GP_otu_table_rand_short.txt.gz", package="phyloseq")
+#' mapfile <- system.file("extdata", "master_map.txt", package="phyloseq")
+#' trefile <- system.file("extdata", "GP_tree_rand_short.newick.gz", package="phyloseq")
+#' import_qiime(otufile, mapfile, trefile)
+import_qiime <- function(otufilename=NULL, mapfilename=NULL,
+  treefilename=NULL, refseqfilename=NULL,
+  refseqFunction=readDNAStringSet, refseqArgs=NULL,
+  parseFunction=parse_taxonomy_qiime, verbose=TRUE, ...){
+
+  # Each component that is successfully obtained is appended to this list,
+  # which is finally passed to phyloseq() in a single call.
+  argumentlist <- list()
+
+  # --- Sample map -------------------------------------------------------
+  if( !is.null(mapfilename) ){
+    if( verbose ){
+      cat("Processing map file...", fill=TRUE)
+    }
+    QiimeMap <- import_qiime_sample_data(mapfilename)
+    argumentlist <- c(argumentlist, list(QiimeMap))
+  }
+
+  # --- Combined OTU abundance / taxonomy table --------------------------
+  if( !is.null(otufilename) ){
+    if( verbose ){
+      cat("Processing otu/tax file...", fill=TRUE)
+    }
+    otutax <- import_qiime_otu_tax(otufilename, parseFunction, verbose=verbose)
+    # Second argument TRUE is passed positionally to otu_table(), as before.
+    otutab <- otu_table(otutax$otutab, TRUE)
+    taxtab <- tax_table(otutax$taxtab)
+    argumentlist <- c(argumentlist, list(otutab), list(taxtab) )
+  }
+
+  # --- Phylogenetic tree ------------------------------------------------
+  if( !is.null(treefilename) ){
+    treeIsObject <- inherits(treefilename, "phylo")
+    if( verbose ){
+      if( treeIsObject ){
+        # A "phylo" object is a list with non-scalar elements, which cat()
+        # cannot print; echoing it used to abort the import. Only the
+        # progress line is printed in that case.
+        cat("Processing phylogenetic tree...", fill=TRUE)
+      } else {
+        cat("Processing phylogenetic tree...\n", treefilename, "...\n")
+      }
+    }
+    if( treeIsObject ){
+      # Already a tree: use it as given, do not try to read it.
+      tree <- treefilename
+    } else {
+      # Otherwise assume a file/connection and attempt to import it.
+      # read_tree() silently returns NULL if the tree cannot be read.
+      tree <- read_tree(treefilename, ...)
+    }
+    # A failed tree import is not fatal: warn and leave the tree out.
+    if( is.null(tree) ){
+      warning("treefilename failed import. It will not be included.")
+    } else {
+      argumentlist <- c(argumentlist, list(tree) )
+    }
+  }
+
+  # --- Reference sequences ----------------------------------------------
+  if( !is.null(refseqfilename) ){
+    if( verbose ){
+      cat("Processing Reference Sequences...", fill=TRUE)
+    }
+    if( inherits(refseqfilename, "XStringSet") ){
+      # Already an XStringSet: use it as given, do not try to read it.
+      refseq <- refseqfilename
+    } else if( !is.null(refseqArgs) ){
+      # Read with the user-supplied reader plus its additional arguments.
+      refseq <- do.call(refseqFunction, c(list(refseqfilename), refseqArgs))
+    } else {
+      refseq <- refseqFunction(refseqfilename)
+    }
+    argumentlist <- c(argumentlist, list(refseq) )
+  }
+
+  # Build the final object from whichever components were collected.
+  do.call("phyloseq", argumentlist)
+}
+################################################################################
+#' Somewhat flexible tree-import function
+#'
+#' This function is a convenience wrapper around the
+#' \code{\link[ape]{read.tree}} (Newick-format) and
+#' \code{\link[ape]{read.nexus}} (Nexus-format) importers provided by
+#' the \code{\link[ape]{ape-package}}. This function attempts to return a valid
+#' tree if possible using either format importer. If it fails, it silently
+#' returns \code{NULL} by default, rather than throwing a show-stopping error.
+#'
+#' @usage read_tree(treefile, errorIfNULL=FALSE, ...)
+#'
+#' @param treefile (Required). A character string implying a file \code{\link{connection}}
+#' (like a path or URL), or an actual \code{\link{connection}}.
+#' Must be a Newick- or Nexus-formatted tree.
+#'
+#' @param errorIfNULL (Optional). Logical. Should an error be thrown if no tree
+#' can be extracted from the connection?
+#' Default is \code{FALSE}, indicating that \code{NULL} will be
+#' SILENTLY returned, rather than an error.
+#' Be cautious about this behavior. Useful for phyloseq internals, but might
+#' be hard to track in your own code if you're not aware of this
+#' ``no error by default'' setting. If this is a problem, change this value
+#' to \code{TRUE}, and you can still use the function.
+#'
+#' @param ... (Optional). Additional parameter(s) passed to the
+#' relevant tree-importing function.
+#'
+#' @return If successful, returns a \code{\link{phylo}}-class object as defined
+#' in the \code{\link[ape]{ape-package}}. Returns NULL if neither tree-reading function worked.
+#'
+#' @seealso
+#' \code{\link{read_tree_greengenes}}
+#'
+#' \code{\link{phylo}}
+#'
+#' \code{\link[ape]{read.tree}}
+#'
+#' \code{\link[ape]{read.nexus}}
+#'
+#' @importFrom ape read.nexus
+#' @importFrom ape read.tree
+#' @export
+#' @examples
+#' read_tree(system.file("extdata", "esophagus.tree.gz", package="phyloseq"))
+#' read_tree(system.file("extdata", "GP_tree_rand_short.newick.gz", package="phyloseq"))
+read_tree <- function(treefile, errorIfNULL=FALSE, ...){
+  # Import a phylogenetic tree, trying the Nexus importer first and the
+  # Newick importer second. Returns the tree after fix_phylo(), or NULL when
+  # neither importer yields a tree (an error instead if `errorIfNULL` is TRUE).
+  if( inherits(treefile, "phylo") ){
+    # A "phylo" object was provided directly; nothing to read.
+    # inherits() accepts "phylo" anywhere in the class vector.
+    tree <- treefile
+  } else {
+    # Otherwise treat the argument as a file path / connection.
+    # Each importer is protected so that a failure yields NULL, not an error.
+    tree <- tryCatch(read.nexus(treefile, ...), error=function(e) NULL)
+    # Try Newick if Nexus didn't work.
+    if( is.null(tree) ){
+      tree <- tryCatch(read.tree(treefile, ...), error=function(e) NULL)
+    }
+  }
+  # If neither tree-import worked (still NULL), raise an error on request.
+  # `&&` is the scalar, short-circuiting operator appropriate inside if().
+  if( errorIfNULL && is.null(tree) ){
+    stop("tree file could not be read.\nPlease retry with valid tree.")
+  }
+  if( !is.null(tree) ){
+    # Perform any standard phyloseq checks/fixes
+    # E.g. Replace any NA branch-length values in the tree with zero.
+    tree <- fix_phylo(tree)
+  }
+  return(tree)
+}
+################################################################################
+#' Read GreenGenes tree released in annotated newick format
+#'
+#' In principle, this is a standard newick format that can be imported
+#' into R using \code{\link{read_tree}},
+#' which in-turn utilizes \code{\link[ape]{read.tree}}.
+#' However, \code{\link[ape]{read.tree}} has failed to import
+#' recent (October 2012 and later) releases of the GreenGenes tree,
+#' and this problem has been traced to the additional annotations
+#' added to some internal nodes
+#' that specify taxonomic classification between single-quotes.
+#' To solve this problem and create a clear container
+#' for fixing future problems with the format of GreenGenes-released trees,
+#' this function is available in phyloseq and exported for users.
+#' It is also referenced in the documentation of the import functions
+#' for QIIME legacy and BIOM format importers --
+#' \code{\link{import_qiime}} and \code{\link{import_biom}}, respectively.
+#' However, since the precise format of the tree is not restricted to GreenGenes trees
+#' by QIIME or for the biom-format, this function is not called
+#' automatically by those aforementioned import functions.
+#' If your tree is formatted like, or is one of, the official GreenGenes
+#' release trees, then you should use this function and provide its output
+#' to your relevant import function.
+#'
+#' @param treefile (Required). A character string implying
+#' a file \code{\link{connection}}
+#' (like a path or URL), or an actual \code{\link{connection}}.
+#' Must be a Newick--formatted tree released by GreenGenes
+#' in October 2012 or later.
+#' The similarity threshold of the OTUs should not matter,
+#' except that it should match your OTU table.
+#'
+#' @return A tree, represented as a \code{\link{phylo}} object.
+#'
+#' @importFrom ape read.tree
+#' @export
+#' @examples
+#' # Read the May 2013, 73% similarity official tree,
+#' # included as extra data in phyloseq.
+#' treefile = system.file("extdata", "gg13-5-73.tree.gz", package="phyloseq")
+#' x = read_tree_greengenes(treefile)
+#' x
+#' class(x)
+#' y = read_tree(treefile)
+#' y
+#' class(y)
+#' ## Not run, causes an error:
+#' # library("ape")
+#' # read.tree(treefile)
+read_tree_greengenes = function(treefile){
+  # Read a GreenGenes-annotated Newick tree. Semicolons are swapped for a
+  # placeholder before parsing and restored in the node labels afterwards;
+  # only the final semicolon is kept as the tree terminator.
+  alines = readLines(treefile, warn=FALSE)
+  # Collapse to one line, in case it isn't already.
+  alines = paste0(alines, collapse="")
+  # replace all semicolons with something weird
+  # that isn't already a special newick character.
+  newdelim = "><-><"
+  clines = gsub(";", newdelim, alines, fixed=TRUE)
+  # reinstate the final character as a semicolon
+  clines = gsub(paste0(newdelim, "$"), ";", clines)
+  # Convert your newick string into a phylo-class tree.
+  tree = read.tree("", text=clines)
+  # gsub() returns a modified copy, so the cleaned labels must be assigned
+  # back to the tree. Skip when there are no node labels, to avoid adding an
+  # empty `node.label` element.
+  if( !is.null(tree$node.label) ){
+    # Now that it is phylo-class, reinstate semicolon
+    # as the delimiter in the node labels
+    tree$node.label = gsub(newdelim, ";", tree$node.label, fixed=TRUE)
+    # Also get rid of those extra quotes
+    tree$node.label = gsub("'", "", tree$node.label, fixed=TRUE)
+  }
+  # Return the cleaned-up tree
+  return(tree)
+}
+################################################################################
+#' Import now legacy-format QIIME OTU table as a list of two matrices.
+#'
+#' Now a legacy-format, older versions of QIIME
+#' produced an OTU file that typically contains both OTU-abundance
+#' and taxonomic identity information in a tab-delimited table.
+#' If your file ends with the extension \code{.biom}, or if you happen to know
+#' that it is a biom-format file, or if you used default settings in a version
+#' of QIIME of \code{1.7} or greater,
+#' then YOU SHOULD USE THE BIOM-IMPORT FUNCTION instead,
+#' \code{\link{import_biom}}.
+#'
+#' This function uses chunking to perform both the reading and parsing in blocks
+#' of optional size,
+#' thus constraining the peak memory usage.
+#' This feature should make this
+#' importer accessible to machines with modest memory,
+#' but with the caveat that
+#' the full numeric matrix must be a manageable size at the end, too.
+#' In principle, the final tables will be large, but much more efficiently represented than
+#' the character-stored numbers.
+#' If total memory for storing the numeric matrix becomes problematic,
+#' a switch to a sparse matrix representation of the abundance
+#' -- which is typically well-suited to this data -- might provide a solution.
+#'
+#' @param file (Required). The path to the qiime-formatted file you want to
+#' import into R. Can be compressed (e.g. \code{.gz}, etc.), though the
+#' details may be OS-specific. That is, Windows-beware.
+#'
+#' @param parseFunction (Optional). An optional custom function for parsing the
+#' character string that contains the taxonomic assignment of each OTU.
+#' The default parsing function is \code{\link{parse_taxonomy_qiime}},
+#' specialized for splitting the \code{";"}-delimited strings and also
+#' attempting to interpret greengenes prefixes, if any, as that is a common
+#' format of the taxonomy string produced by QIIME.
+#'
+#' @param verbose (Optional). A \code{\link{logical}}.
+#' Default is \code{TRUE}.
+#' Should progress messages
+#' be \code{\link{cat}}ted to standard out?
+#'
+#' @param parallel (Optional). Logical. Should the parsing be performed in
+#' parallel?. Default is \code{FALSE}. Only a few steps are actually
+#' parallelized, and for most datasets it will actually be faster and
+#' more efficient to keep this set to \code{FALSE}.
+#' Also, to get any benefit at all, you will need to register a
+#' parallel ``backend'' through one of the backend packages supported
+#' by the \code{\link{foreach-package}}.
+#'
+#' @return A list of two matrices. \code{$otutab} contains the OTU Table
+#' as a numeric matrix, while \code{$taxtab} contains a character matrix
+#' of the taxonomy assignments.
+#'
+#' @importFrom data.table fread
+#' @importFrom plyr llply
+#'
+#' @seealso
+#' \code{\link{import}}
+#'
+#' \code{\link{merge_phyloseq}}
+#'
+#' \code{\link{phyloseq}}
+#'
+#' \code{\link{import_qiime}}
+#'
+#' \code{\link{read_tree}}
+#'
+#' \code{\link{read_tree_greengenes}}
+#'
+#' \code{\link{import_env_file}}
+#'
+#' @export
+#' @examples
+#' otufile <- system.file("extdata", "GP_otu_table_rand_short.txt.gz", package="phyloseq")
+#' import_qiime_otu_tax(otufile)
+import_qiime_otu_tax <- function(file, parseFunction=parse_taxonomy_qiime,
+                                 verbose=TRUE, parallel=FALSE){
+  # Read a legacy QIIME OTU table and return list(otutab=, taxtab=):
+  # the matrix built from the abundance columns, and the taxonomy table
+  # built from the parsed "Consensus Lineage" strings.
+  if(verbose){cat("Reading file into memory prior to parsing...\n")}
+  x = readLines(file)
+  if(verbose){cat("Detecting first header line...\n")}
+  # Check for commented lines among (at most) the first 25 lines.
+  # The deepest commented line (largest n) is assumed to have header information.
+  firstLines = x[seq_len(min(25L, length(x)))]
+  commentedLines = which(substr(firstLines, 1, 1)=="#")
+  if( length(commentedLines) < 1L ){
+    # Without this guard max() of an empty set is -Inf and the table
+    # reader fails with an unrelated message.
+    stop("No header line beginning with '#' was found in the first 25 lines of the file.")
+  }
+  skipLines = max(commentedLines)-1L
+  if(verbose){cat("Header is on line", (skipLines + 1L), " \n")}
+  if(verbose){cat("Converting input file to a table...\n")}
+  x = fread(input=paste0(x, collapse="\n"), sep="\t", header=TRUE, skip=skipLines)
+  if(verbose){cat("Defining OTU table... \n")}
+  taxstring = x$`Consensus Lineage`
+  # This pops the taxonomy (Consensus Lineage) column, in-place statement
+  x[, `Consensus Lineage`:=NULL]
+  # Store the OTU names, you will pop the column
+  OTUnames = x$`#OTU ID`
+  # This pops the OTUID column, in-place statement
+  x[, `#OTU ID`:=NULL]
+  x <- as(x, "matrix")
+  rownames(x) <- OTUnames
+  rm(OTUnames)
+  if(verbose){cat("Parsing taxonomy table...\n")}
+  # Split into "jagged" list (vectors of different lengths)
+  taxlist = llply(taxstring, parseFunction, .parallel=parallel)
+  # Add OTU names to list element names
+  names(taxlist) <- rownames(x)
+  # Build the tax table from the jagged list.
+  taxtab <- build_tax_table(taxlist)
+  # Call garbage collection one more time. Lots of unneeded stuff.
+  gc(FALSE)
+  # Return the named list
+  return(list(otutab=x, taxtab=taxtab))
+}
+################################################################################
+################################################################################
+#' Import just \code{sample_data} file from QIIME pipeline.
+#'
+#' QIIME produces several files that can be analyzed in the phyloseq-package,
+#' This includes the map-file, which is an important \emph{input}
+#' to QIIME that can also indicate sample covariates. It is converted naturally to the
+#' sample_data component data type in phyloseq-package, based on the R data.frame.
+#'
+#' See \code{\link{import_qiime}} for more information about QIIME. It is also the
+#' suggested function for importing QIIME-produced data files.
+#'
+#' @usage import_qiime_sample_data(mapfilename)
+#'
+#' @param mapfilename (Required). A character string or connection.
+#' That is, any suitable \code{file} argument to the \code{\link{read.table}} function.
+#' The name of the QIIME map
+#' file required for processing pyrosequencing tags
+#' in QIIME as well as some of the post-clustering analysis. This is a required
+#' input file for running QIIME. Its strict formatting specification is expected by
+#' this function, do not attempt to modify it manually once it has worked properly
+#' in QIIME.
+#'
+#' @return A \code{sample_data} object.
+#'
+#' @seealso
+#'
+#' \code{\link{import}}
+#'
+#' \code{\link{merge_phyloseq}}
+#'
+#' \code{\link{phyloseq}}
+#'
+#' \code{\link{import_qiime}}
+#'
+#' \code{\link{import_qiime_otu_tax}}
+#'
+#' \code{\link{import_env_file}}
+#'
+#' @export
+#' @examples
+#' mapfile <- system.file("extdata", "master_map.txt", package = "phyloseq")
+#' import_qiime_sample_data(mapfile)
+import_qiime_sample_data <- function(mapfilename){
+  # Read the tab-delimited map file with a header row. comment.char=""
+  # turns off comment handling, so "#" characters are read as ordinary data.
+  map_df <- read.table(mapfilename, sep="\t", header=TRUE, comment.char="")
+  # Name the rows after the entries of the first column.
+  row_labels <- as.character(map_df[, 1])
+  rownames(map_df) <- row_labels
+  # Wrap the data.frame as a sample_data object.
+  sample_data(map_df)
+}
+################################################################################
+#' Read a UniFrac-formatted ENV file.
+#'
+#' Convenience wrapper function to read the environment-file, as formatted for
+#' input to the UniFrac server (\url{http://bmf2.colorado.edu/unifrac/}).
+#' The official format of these files is that
+#' each row specifies (in order) the sequence name, source sample, and (optionally)
+#' the number of times the sequence was observed.
+#'
+#' @usage import_env_file(envfilename, tree=NULL, sep="\t", ...)
+#'
+#' @param envfilename (Required). A character string of the ENV filename (relative or absolute)
+#'
+#' @param tree (Optional). \code{\link{phylo-class}} object to be paired with
+#' the output otu_table.
+#'
+#' @param sep A character string indicating the delimiter used in the file.
+#' The default is \code{"\t"}.
+#'
+#' @param ... Additional parameters passed on to \code{\link{read.table}}.
+#'
+#' @return An \code{\link{otu_table-class}}, or \code{\link{phyloseq-class}} if
+#' a \code{\link{phylo-class}} argument is provided to \code{tree}.
+#'
+#' @references \url{http://bmf2.colorado.edu/unifrac/}
+#'
+#' @seealso
+#' \code{\link{import}}
+#'
+#' \code{\link{tip_glom}}
+#' @export
+#' @examples
+#' # import_env_file(myEnvFile, myTree)
+import_env_file <- function(envfilename, tree=NULL, sep="\t", ...){
+  # Read the ENV file; additional arguments are forwarded to read.table().
+  tipSampleTable <- read.table(envfilename, sep=sep, ...)
+  # Convert to otu_table-class table (trivial table)
+  physeq <- envHash2otu_table(tipSampleTable)
+  # If tree is provided, combine it with the OTU Table.
+  # inherits() always gives a single TRUE/FALSE, including for NULL and for
+  # objects whose class vector has more than one element.
+  if( inherits(tree, "phylo") ){
+    # Create phyloseq-class with a tree and OTU Table (will perform any needed pruning)
+    physeq <- phyloseq(physeq, tree)
+  }
+  return(physeq)
+}
+################################################################################
+#' Convert a sequence-sample hash (like ENV file) into an OTU table.
+#'
+#' Parses an ENV-file into a sparse matrix of species-by-sample, where
+#' each species-row has only one non-zero value. We call this sparse abundance
+#' table the trivial OTU table, where every sequence is treated as a separate
+#' species. If a phylogenetic tree is available, it can be submitted with this
+#' table as arguments to \code{\link{tip_glom}} to create an object with a
+#' non-trivial \code{otu_table}.
+#'
+#' @usage envHash2otu_table(tipSampleTable)
+#'
+#' @param tipSampleTable (Required). A two-column character table (matrix or data.frame),
+#' where each row specifies the sequence name and source sample, consistent with the
+#' env-file for the UniFrac server (\url{http://bmf2.colorado.edu/unifrac/}).
+#'
+#' @return \code{\link{otu_table}}. A trivial OTU table where each sequence
+#' is treated as a separate OTU.
+#'
+#' @references \url{http://bmf2.colorado.edu/unifrac/}
+#'
+#' @seealso
+#' \code{\link{import_env_file}}
+#'
+#' \code{\link{tip_glom}}
+#'
+#' \code{\link{otu_table}}
+#'
+#' @keywords internal
+#' @examples #
+#' ## fakeSeqNameVec <- paste("seq_", 1:8, sep="")
+#' ## fakeSamNameVec <- c(rep("A", 4), rep("B", 4))
+#' ## fakeSeqAbunVec <- sample(1:50, 8, TRUE)
+#' ## test <- cbind(fakeSeqNameVec, fakeSamNameVec, fakeSeqAbunVec)
+#' ## testotu <- envHash2otu_table( test )
+#' ## test <- cbind(fakeSeqNameVec, fakeSamNameVec)
+#' ## testotu <- envHash2otu_table( test )
+envHash2otu_table <- function(tipSampleTable){
+  # Build the "trivial" OTU table (one row per sequence, one column per
+  # sample) from a 2- or 3-column sequence/sample(/count) table.
+  if( ncol(tipSampleTable) > 2 ){
+    # Coerce identifiers to character first: factor columns would otherwise
+    # index the matrix by level code rather than by name.
+    seqNames <- as.character(tipSampleTable[, 1])
+    samNames <- as.character(tipSampleTable[, 2])
+    seqCounts <- as.integer(as.character(tipSampleTable[, 3]))
+    uniqueSamples <- unique(samNames)
+    trivialOTU <- matrix(0, nrow=length(seqNames), ncol=length(uniqueSamples),
+                         dimnames=list(seqNames, uniqueSamples))
+    # One non-zero cell per input row: (row i, column of that row's sample).
+    # Positional row indexing also keeps duplicated sequence names apart.
+    cellIndex <- cbind(seq_along(seqNames), match(samNames, uniqueSamples))
+    trivialOTU[cellIndex] <- seqCounts
+  } else {
+    # No count column: cross-tabulate sequence by sample.
+    trivialOTU <- table(as.data.frame(tipSampleTable))
+    trivialOTU <- as(trivialOTU, "matrix")
+  }
+  return( otu_table(trivialOTU, taxa_are_rows=TRUE) )
+}
+################################################################################
+################################################################################
+#' Import RDP cluster file and return otu_table (abundance table).
+#'
+#' The RDP cluster pipeline (specifically, the output of the complete linkage clustering step)
+#' has no formal documentation for the \code{".clust"}
+#' file or its apparent sequence naming convention.
+#'
+#' \code{http://pyro.cme.msu.edu/index.jsp}
+#'
+#' The cluster file itself contains
+#' the names of all sequences contained in input alignment. If the upstream
+#' barcode and alignment processing steps are also done with the RDP pipeline,
+#' then the sequence names follow a predictable naming convention wherein each
+#' sequence is named by its sample and sequence ID, separated by a \code{"_"} as
+#' delimiter:
+#'
+#' \code{"sampleName_sequenceIDnumber"}
+#'
+#' This import function assumes that the sequence names in the cluster file follow
+#' this convention, and that the sample name does not contain any \code{"_"}. It
+#' is unlikely to work if this is not the case. It is likely to work if you used
+#' the upstream steps in the RDP pipeline to process your raw (barcoded, untrimmed)
+#' fasta/fastq data.
+#'
+#' This function first loops through the \code{".clust"} file and collects all
+#' of the sample names that appear. It secondly loops through each OTU (\code{"cluster"};
+#' each row of the cluster file) and sums the number of sequences (reads) from
+#' each sample. The resulting abundance table of OTU-by-sample is trivially
+#' coerced to an \code{\link{otu_table}} object, and returned.
+#'
+#' @usage import_RDP_cluster(RDP_cluster_file)
+#'
+#' @param RDP_cluster_file A character string. The name of the \code{".clust"}
+#' file produced by
+#' the complete linkage clustering step of the RDP pipeline.
+#'
+#' @return An \code{\link{otu_table}} object parsed from the \code{".clust"} file.
+#'
+#' @references \url{http://pyro.cme.msu.edu/index.jsp}
+#'
+#' @export
+#'
+import_RDP_cluster <- function(RDP_cluster_file){
+
+  # Read file and pop the header lines (the first five lines)
+  RDP_raw_otu_lines_only <- readLines(RDP_cluster_file)[-(1:5)]
+  if( length(RDP_raw_otu_lines_only) < 1L ){
+    stop("No OTU lines were found after the header lines of the RDP cluster file.")
+  }
+
+  # internal function:
+  # Return the sample name of every sequence listed on one OTU line.
+  get_sample_names_from_one_line <- function(otuline){
+    # Each OTU line has a 3 element "line header" that indicates the OTUID,
+    # the name of the file, and the number of sequences in this cluster.
+    # Split the line on tabs and keep only the 4th element.
+    seqIDonly <- strsplit(otuline, "\t")[[1]][4]
+
+    # Split the sequence names on white space
+    seqIDonly <- strsplit(seqIDonly, "[[:space:]]+")[[1]]
+
+    # For each sequence name, split on the underscore delimiter
+    splitseqnames <- strsplit(seqIDonly, "_", fixed=TRUE)
+
+    # The sample name is the first piece (assumes no "_" in sample names)
+    vapply(splitseqnames, function(parts){parts[1]}, character(1))
+  }
+
+  ## Parse every OTU line once. The per-line sample names are used both for
+  ## the unique sample list and for the per-OTU counts below.
+  RDPsamplenameslist <- lapply(RDP_raw_otu_lines_only, get_sample_names_from_one_line)
+  RDPsamplenames <- unique(unlist(RDPsamplenameslist))
+
+  # remove NAs
+  RDPsamplenames <- RDPsamplenames[!is.na(RDPsamplenames)]
+
+  # Initialize otu abundance matrix.
+  # NOTE: paste() uses its default separator here, so the row names have the
+  # form "OTUID_ 1"; kept unchanged so existing taxa names stay the same.
+  otumat <- matrix(0, nrow=length(RDP_raw_otu_lines_only), ncol=length(RDPsamplenames))
+  rownames(otumat) <- paste("OTUID_", seq_along(RDP_raw_otu_lines_only))
+  colnames(otumat) <- RDPsamplenames
+
+  # Count the sequences from each sample, one OTU (row) at a time.
+  for( i in seq_along(RDPsamplenameslist) ){
+    # table() counts each sample name and ignores NA entries
+    OTUi <- table(RDPsamplenameslist[[i]])
+
+    # store results of this OTU in abundance matrix
+    otumat[i, names(OTUi)] <- as.vector(OTUi)
+  }
+
+  # Return the abundance table.
+  return( otu_table(otumat, taxa_are_rows=TRUE) )
+}
+################################################################################
+#' Import new RDP OTU-table format
+#'
+#' Recently updated tools on RDP Pyro site make it easier to import Pyrosequencing output
+#' into R. The modified tool ``Cluster To R Formatter'' can take a cluster file
+#' (generated from RDP Clustering tools) to create a community data matrix file
+#' for distance cutoff range you are interested in. The resulting output file
+#' is a tab-delimited file containing the number of sequences for each sample
+#' for each OTU. The OTU header naming convention is \code{"OTU_"} followed by the OTU
+#' number in the cluster file. It pads ``0''s to make the OTU header easy to sort.
+#' The OTU numbers are not necessarily in order.
+#'
+#' @usage import_RDP_otu(otufile)
+#'
+#' @param otufile (Required).
+#' A character string indicating the file location of the OTU file,
+#' produced/exported according to the instructions above.
+#'
+#' @return A \code{\link{otu_table-class}} object.
+#'
+#' @seealso
+#' An alternative ``cluster'' file importer for RDP results:
+#' \code{\link{import_RDP_cluster}}
+#'
+#' The main RDP-pyrosequencing website
+#' \url{http://pyro.cme.msu.edu/index.jsp}
+#'
+#' @export
+#' @examples
+#' otufile <- system.file("extdata", "rformat_dist_0.03.txt.gz", package="phyloseq")
+#' ### the gzipped file is automatically recognized, and read using R-connections
+#' ex_otu <- import_RDP_otu(otufile)
+#' class(ex_otu)
+#' ntaxa(ex_otu)
+#' nsamples(ex_otu)
+#' sample_sums(ex_otu)
+#' head(t(ex_otu))
+import_RDP_otu <- function(otufile){
+  # Read the tab-delimited file with a header row, taking the row names
+  # from the first column.
+  community <- read.table(otufile, header=TRUE, sep="\t", row.names=1)
+  # Wrap it as an otu_table with taxa_are_rows set to FALSE.
+  otu_table(community, taxa_are_rows=FALSE)
+}
+################################################################################
+################################################################################
+################################################################################
+################################################################################
+################################################################################
+#' Imports a tab-delimited version of the pyrotagger output file.
+#'
+#' PyroTagger is a web-server that takes raw, barcoded 16S rRNA amplicon sequences
+#' and returns an excel spreadsheet (\code{".xls"}) with both abundance and
+#' taxonomy data. It also includes some confidence information related to the
+#' taxonomic assignment.
+#'
+#' PyroTagger is created and maintained by the Joint Genome Institute
+#' at \code{"http://pyrotagger.jgi-psf.org/"}
+#'
+#' The typical output from PyroTagger is a spreadsheet format \code{".xls"}, which poses
+#' additional import challenges. However, virtually all spreadsheet applications
+#' support the \code{".xls"} format, and can further export this file in a
+#' tab-delimited format. It is recommended that you convert the xls-file without
+#' any modification (as tempting as it might be once you have loaded it) into a
+#' tab-delimited text file. Deselect any options to encapsulate fields in quotes,
+#' as extra quotes around each cell's contents might cause problems during
+#' file processing. These quotes will also inflate the file-size, so leave them out
+#' as much as possible, while also resisting any temptation to modify the xls-file
+#' ``by hand''.
+#'
+#' A highly-functional and free spreadsheet application can be obtained as part
+#' of the cross-platform \code{OpenOffice} suite. It works for the above
+#' required conversion. Go to \code{"http://www.openoffice.org/"}.
+#'
+#' It is regrettable that this importer does not take the xls-file directly
+#' as input. However, because of the moving-target nature of spreadsheet
+#' file formats, there is limited support for direct import of these formats into
+#' \code{R}. Rather than add to the dependency requirements of \emph{phyloseq}
+#' and the relative support of these xls-support packages, it seems more efficient
+#' to choose an arbitrary delimited text format, and focus on the data
+#' structure in the PyroTagger output. This will be easier to support in the
+#' long-run.
+#'
+#' @usage import_pyrotagger_tab(pyrotagger_tab_file,
+#' strict_taxonomy=FALSE, keep_potential_chimeras=FALSE)
+#'
+#' @param pyrotagger_tab_file (Required). A character string. The name of the tab-delimited
+#' pyrotagger output table.
+#'
+#' @param strict_taxonomy (Optional). Logical. Default \code{FALSE}. Should the taxonomyTable
+#' component be limited to just taxonomic data? Default includes all fields from
+#' the pyrotagger file.
+#'
+#' @param keep_potential_chimeras (Optional). Logical. Default \code{FALSE}. The
+#' pyrotagger output also includes OTUs that are tagged by pyrotagger as likely
+#' chimeras. These putative chimeric OTUs can be retained if set to \code{TRUE}.
+#' The putative chimeras are excluded by default.
+#'
+#' @return An \code{otuTax} object containing both the otu_table and TaxonomyTable data
+#' components, parsed from the pyrotagger output.
+#'
+#' @export
+#'
+#' @references \url{http://pyrotagger.jgi-psf.org/}
+#'
+#' @examples
+#' ## New_otuTaxObject <- import_pyrotagger_tab(pyrotagger_tab_file)
+import_pyrotagger_tab <- function(pyrotagger_tab_file,
+  strict_taxonomy=FALSE, keep_potential_chimeras=FALSE){
+  # Parse a tab-delimited PyroTagger table into an abundance matrix and a
+  # taxonomy matrix, and return them combined via phyloseq().
+
+  x <- readLines(pyrotagger_tab_file, warn=FALSE)
+  # Get the header
+  pyro_header <- strsplit(x[1], "\t", TRUE)[[1]]
+  # Pop the first (header) line from the list.
+  x <- x[-1]
+
+  ########################################
+  ### There are "Potential chimeras"
+  ### listed in the typical output, separated by 2 completely blank lines
+  ### after the last confidently-good OTU.
+  ########################################
+  chimera_line <- grep("Potential chimeras", x, fixed=TRUE)
+  # A file without a chimera section has nothing to pop; skipping the step
+  # avoids a zero-length argument to `:`.
+  if( length(chimera_line) > 0 ){
+    # Only the first match delimits the chimera section.
+    chimera_line <- chimera_line[1]
+    if( keep_potential_chimeras ){
+      # Pop just the blank lines that delimit the chimeras
+      # at the bottom of the table
+      x <- x[-((chimera_line-2):chimera_line)]
+    } else {
+      # Pop the delimiter lines and everything after them
+      x <- x[-((chimera_line-2):length(x))]
+    }
+  }
+
+  ########################################
+  # The tab-split character list, z
+  ########################################
+  z <- strsplit(x, "\t", TRUE)
+  names(z) <- sapply(z, function(z){z[1]})
+
+  # The table switches from abundance to taxonomy at the "% identity" column
+  taxonomy_table_column_index <- which( pyro_header == "% identity" )
+
+  ########################################
+  # Initialize the two matrices
+  # (otu_table and taxonomyTable)
+  ########################################
+  ### Initialize abundance matrix, a
+  a <- matrix(0, nrow=length(x), ncol=(taxonomy_table_column_index-2))
+  colnames(a) <- pyro_header[2:(taxonomy_table_column_index-1)]
+  rownames(a) <- names(z)
+
+  ###### Initialize the raw pyrotagger taxonomy matrix, w
+  ntax_tablecols <- (max(sapply(z, length)) - taxonomy_table_column_index + 1)
+  w <- matrix("", nrow=length(x), ncol=ntax_tablecols)
+  rownames(w) <- names(z)
+  colnamesw <- pyro_header[-(1:(taxonomy_table_column_index-1))]
+  colnamesw <- colnamesw[1:which(colnamesw=="Taxonomy")]
+  colnamesw <- c(colnamesw, paste("col", (which(colnamesw=="Taxonomy")+1):ntax_tablecols, sep="") )
+  colnames(w) <- colnamesw
+
+  # Rename the taxonomy columns
+  biotaxonomy <- c("Domain", "Phylum", "Class", "Order",
+                   "Family", "Genus", "Species", "Strain")
+  colnames(w)[which(colnames(w)=="Taxonomy"):length(colnames(w))][1:length(biotaxonomy)] <- biotaxonomy
+
+  # Loop through each line and add to appropriate matrix.
+  for( i in rownames(a) ){
+    # i <- rownames(a)[[1]]
+    # cut out just the abundance part, and convert to integer
+    y <- as.integer(z[[i]][2:(taxonomy_table_column_index-1)])
+    y[is.na(y)] <- 0
+    a[i, ] <- y
+
+    # Taxonomy data is jagged
+    taxi <- z[[i]][-(1:(taxonomy_table_column_index-1))]
+    w[i, 1:length(taxi)] <- taxi
+  }
+
+  # Create the component objects
+  OTU <- otu_table(a, taxa_are_rows=TRUE)
+  if( strict_taxonomy ){
+    # Keep only the taxonomic-rank columns of w. drop=FALSE keeps a
+    # matrix even when only one OTU row is present.
+    TAX <- tax_table(w[, biotaxonomy, drop=FALSE])
+  } else {
+    TAX <- tax_table(w)
+  }
+
+  return( phyloseq(OTU, TAX) )
+
+}
+################################################################################
+################################################################################
+#' Show cutoff values available in a mothur file.
+#'
+#' This is a helper function to report back to the user the different cutoff
+#' values available in a given mothur file --
+#' for instance, a list or shared file.
+#'
+#' @param mothur_list_file The file name and/or location as produced by \emph{mothur}.
+#'
+#' @return A character vector of the different cutoff values contained in the file.
+#' For a given set of arguments to the \code{cluster()} command from within
+#' \emph{mothur}, a number of OTU-clustering results are returned in the same
+#' file. The exact cutoff values used by \emph{mothur} can vary depending
+#' on the input data/parameters. This simple function returns the cutoffs that were actually
+#' included in the \emph{mothur} output. This is an important extra step prior to
+#' importing data with the \code{\link{import_mothur}} function.
+#'
+#' @export
+#'
+#' @seealso \code{\link{import_mothur}}
+#'
+show_mothur_cutoffs <- function(mothur_list_file){
+  # Declaring the tab as scan()'s comment character makes it discard
+  # everything from the first tab onward on each line, so only the leading
+  # field of every line is read.
+  line_labels <- scan(file=mothur_list_file, what="character",
+                      comment.char="\t", quiet=TRUE)
+  # Report each distinct value once, in order of first appearance.
+  unique(line_labels)
+}
+################################################################################
+#' Import mothur list file and return as list object in R.
+#'
+#' This is a user-available module of a more comprehensive function for importing
+#' OTU clustering/abundance data using the \emph{mothur} package. The list object
+#' returned by this function is not immediately useable by other \emph{phyloseq}
+#' functions, and must be first parsed in conjunction with a separate \emph{mothur}
+#' \code{"group"} file. This function is made accessible to \emph{phyloseq} users
+#' for troubleshooting and inspection, but the \code{\link{import_mothur}} function
+#' is suggested if the goal is to import the OTU clustering results from \emph{mothur}
+#' into \emph{phyloseq}.
+#'
+#' @usage import_mothur_otulist(mothur_list_file, cutoff=NULL)
+#'
+#' @param mothur_list_file The list file name and/or location as produced by \emph{mothur}.
+#'
+#' @param cutoff A character string indicating the cutoff value, (or \code{"unique"}),
+#' that matches one of the cutoff-values used to produce the OTU clustering
+#' results contained within the list-file created by \emph{mothur}. The default
+#' is to take the largest value among the cutoff values contained in the list
+#' file. If only one cutoff is included in the file, it is taken and this
+#' argument does not need to be specified. Note that the \code{cluster()}
+#' function within the \emph{mothur} package will often produce a list file
+#' with multiple cutoff values, even if a specific cutoff is specified. It is
+#' suggested that you check which cutoff values are available in a given list
+#' file using the \code{\link{show_mothur_cutoffs}} function.
+#'
+#' @return A list, where each element is a character vector of 1 or more
+#' sequence identifiers, indicating how each sequence from the original data
+#' is clustered into OTUs by \emph{mothur}. Note that in some cases this is highly
+#' dependent on the choice for \code{cutoff}.
+#'
+#' @seealso \code{\link{show_mothur_cutoffs}}, \code{\link{import_mothur}}
+#' @keywords internal
+#'
+import_mothur_otulist <- function(mothur_list_file, cutoff=NULL){
+  # Return a named list with one character vector of sequence identifiers
+  # per OTU, read from the line of the list file that matches `cutoff`.
+  # mothur_list_file = system.file("extdata", "esophagus.fn.list.gz", package="phyloseq")
+  # cutoff = 0.04
+  cutoffs = show_mothur_cutoffs(mothur_list_file)
+  cutoff = select_mothur_cutoff(cutoff, cutoffs)
+  # Read only the line corresponding to that cutoff
+  inputline = which(cutoffs == cutoff)
+  rawlines = scan(mothur_list_file, "character", sep="\t", skip=(inputline-1), nlines=1, na.strings="", quiet=TRUE)
+  # Empty fields (e.g. from a trailing tab) are read as NA; drop them.
+  rawlines = rawlines[!is.na(rawlines)]
+  # The first two elements are the cutoff and the number of OTUs.
+  # Every remaining field is one OTU. Indexing with 3:(number of OTUs)
+  # would stop two fields short, so drop the two leading fields instead.
+  otufields = rawlines[-(1:2)]
+  # split each element on commas
+  OTUs <- strsplit(otufields, ",", fixed=TRUE)
+  # Name each OTU (currently as the first seq name in each cluster), and return the list
+  names(OTUs) <- vapply(OTUs, function(seqids){seqids[1]}, character(1))
+  # return as-is
+  return(OTUs)
+}
+################################################################################
+# Select (or validate) the cutoff label to import from a mothur file.
+# Multiple cutoffs are provided in both `.shared` and `.list` files.
+# This function consolidates the heuristic for selecting/checking a cutoff:
+#  * cutoff is NULL, several available: take the numerically largest
+#    non-"unique" cutoff, returned exactly as it is spelled in `cutoffs`.
+#  * cutoff is NULL, only one available (possibly "unique"): use that one.
+#  * cutoff provided: coerce to character, stop() if it is not in `cutoffs`.
+# In every non-error case the chosen cutoff is returned as a character value.
+#' @keywords internal
+select_mothur_cutoff <- function(cutoff, cutoffs){
+  if( !is.null(cutoff) ){
+    # Provided by user, non-null. Coerce to character for indexing
+    cutoff <- as.character(cutoff)
+    # Check that it is in set of available cutoffs.
+    if( !cutoff %in% cutoffs ){
+      stop("The cutoff value you provided is not among those available. Try show_mothur_cutoffs()")
+    }
+    # Return explicitly: a bare `if` with a FALSE condition evaluates to NULL.
+    return(cutoff)
+  }
+  # cutoff was NULL, need to select one.
+  if( length(cutoffs) > 1 ){
+    # Select the largest value, avoiding the "unique" option.
+    candidates <- cutoffs[cutoffs != "unique"]
+    # Index back into the original labels rather than re-formatting the
+    # number, so that e.g. "0.10" is not turned into "0.1".
+    return(candidates[which.max(as(candidates, "numeric"))])
+  }
+  # There is only one cutoff value, so use it.
+  return(cutoffs)
+}
+################################################################################
+#' Parse mothur group file into a simple hash table.
+#'
+#' The data.frame object
+#' returned by this function is not immediately useable by other \emph{phyloseq}
+#' functions, and must be first parsed in conjunction with a separate \emph{mothur}
+#' \code{"list"} file. This function is made accessible to \emph{phyloseq} users
+#' for troubleshooting and inspection, but the \code{\link{import_mothur}} function
+#' is suggested if the goal is to import the OTU clustering results from \emph{mothur}
+#' into \emph{phyloseq}. You will need both a group file and a list file for that end.
+#'
+#' @usage import_mothur_groups(mothur_group_file)
+#'
+#' @param mothur_group_file A character string indicating the location of the
+#' \emph{mothur}-produced group file in which the sample-source of each sequence
+#' is recorded. See
+#' \code{http://www.mothur.org/wiki/Make.group}
+#'
+#' @return A data.frame that is effectively a hash table between sequence names
+#' and their sample source.
+#'
+#' @seealso \code{\link{import_mothur}}
+#' @keywords internal
+#'
+import_mothur_groups <- function(mothur_group_file){
+  # Read the tab-delimited group file entirely as character data.
+  # The first column becomes the row names of the returned data.frame, so the
+  # remaining column(s) can be looked up by the values of that first column.
+  group_table <- read.table(file=mothur_group_file, sep="\t", row.names=1, colClasses="character", as.is=TRUE, stringsAsFactors=FALSE)
+  return(group_table)
+}
+################################################################################
+#' Import mothur list and group files and return an otu_table
+#'
+#' @usage import_mothur_otu_table(mothur_list_file, mothur_group_file, cutoff=NULL)
+#'
+#' @param mothur_list_file The list file name and/or location as produced by \emph{mothur}.
+#'
+#' @param mothur_group_file The name/location of the group file produced
+#' by \emph{mothur}'s \code{make.group()} function. It contains information
+#' about the sample source of individual sequences, necessary for creating a
+#' species/taxa abundance table (\code{otu_table}). See
+#' \code{http://www.mothur.org/wiki/Make.group}
+#'
+#' @param cutoff A character string indicating the cutoff value, (or \code{"unique"}),
+#' that matches one of the cutoff-values used to produce the OTU clustering
+#' results contained within the list-file created by \emph{mothur} (and specified
+#' by the \code{mothur_list_file} argument).
+#' The default
+#' is to take the largest value among the cutoff values contained in the list
+#' file. If only one cutoff is included in the file, it is taken and this
+#' argument does not need to be specified. Note that the \code{cluster()}
+#' function within the \emph{mothur} package will often produce a list file
+#' with multiple cutoff values, even if a specific cutoff is specified. It is
+#' suggested that you check which cutoff values are available in a given list
+#' file using the \code{\link{show_mothur_cutoffs}} function.
+#'
+#' @return An \code{\link{otu_table}} object.
+#'
+#' @seealso \code{\link{import_mothur}}
+#' @keywords internal
+#' @importFrom plyr ldply
+#' @importFrom plyr ddply
+import_mothur_otu_table <- function(mothur_list_file, mothur_group_file, cutoff=NULL){
+  # Cross-tabulate the OTU membership of every read (list file) against the
+  # sample of every read (group file) into an OTU-by-sample count matrix.
+  clusters <- import_mothur_otulist(mothur_list_file, cutoff)
+  read_groups <- import_mothur_groups(mothur_group_file)
+
+  # Zero-filled matrix: one row per OTU, one column per distinct sample.
+  # Only the non-zero cells are assigned further down.
+  sample_ids <- unique(read_groups[, 1])
+  counts <- matrix(0, nrow=length(clusters), ncol=length(sample_ids))
+  rownames(counts) <- names(clusters)
+  colnames(counts) <- sample_ids
+
+  # Long ("sparse") form: one row per read, labelled with its OTU.
+  # ldply() puts the list names in the first column, renamed to "OTU" here.
+  reads <- ldply(clusters, function(ids){data.frame(read=ids, stringsAsFactors=FALSE)})
+  colnames(reads)[1] <- "OTU"
+  # Attach each read's sample; the group table is indexed by read name.
+  reads <- data.frame(reads, sample=read_groups[reads[, "read"], 1], stringsAsFactors=FALSE)
+  # Count the reads in every observed (OTU, sample) pair.
+  tallies <- ddply(reads, c("OTU", "sample"), function(piece){
+    data.frame(piece[1, c("OTU", "sample"), drop=FALSE], abundance=nrow(piece))
+  })
+
+  # Fill all non-zero cells in one assignment via matrix indexing: each row
+  # of the two-column character matrix addresses one (row name, column name)
+  # cell. See help("Extract").
+  cell_index <- as(tallies[, c("OTU", "sample")], "matrix")
+  counts[cell_index] <- tallies[, "abundance"]
+
+  # Finally, return the counts as a phyloseq otu_table object.
+  return(otu_table(counts, taxa_are_rows=TRUE))
+}
+################################################################################
+#' Import mothur shared file and return an otu_table
+#'
+#' @param mothur_shared_file (Required). A
+#' \href{http://www.mothur.org/wiki/Shared_file}{shared file}
+#' produced by \emph{mothur}.
+#'
+#' @param cutoff (Optional). The label that the rows to import start with.
+#' It must be one of the values reported by \code{\link{show_mothur_cutoffs}}
+#' for this file. If \code{NULL}, the default, a cutoff is chosen
+#' automatically (the largest non-\code{"unique"} value, or the only
+#' value if the file holds a single cutoff).
+#'
+#' @return An \code{\link{otu_table}} object.
+#'
+#' @seealso \code{\link{import_mothur}}
+#' @keywords internal
+import_mothur_shared <- function(mothur_shared_file, cutoff=NULL){
+  #mothur_shared_file = "~/github/phyloseq/inst/extdata/esophagus.fn.shared.gz"
+  # Check that cutoff is in cutoffs, or select a cutoff if none given.
+  cutoffs <- show_mothur_cutoffs(mothur_shared_file)
+  # "label" is the first word of the header line, not a cutoff.
+  cutoffs <- cutoffs[!cutoffs %in% "label"]
+  selected <- select_mothur_cutoff(cutoff, cutoffs)
+  if( is.null(selected) && !is.null(cutoff) ){
+    # Guard: if the selector only validated a user-supplied cutoff and handed
+    # back NULL, fall back to the (validated) value the user asked for.
+    selected <- as.character(cutoff)
+  }
+  x <- readLines(mothur_shared_file)
+  # Keep the lines whose first tab-delimited field is exactly the cutoff.
+  # A literal comparison that includes the delimiter avoids treating "." as
+  # a regex wildcard and avoids "0.1" also selecting "0.10" rows.
+  prefix <- paste0(selected, "\t")
+  keep <- substr(x, 1, nchar(prefix)) == prefix
+  # Column 2 becomes the row names; of the remaining columns the first two
+  # are dropped, leaving the counts. drop=FALSE keeps a single-OTU table
+  # as a table.
+  rawtab <- read.table(text=x[keep], header=FALSE, row.names=2, stringsAsFactors=FALSE)[, -(1:2), drop=FALSE]
+  # OTU names are taken from the header line, starting at its 4th field.
+  colnames(rawtab) <- strsplit(x[1], "\t")[[1]][seq_len(ncol(rawtab)) + 3L]
+  # Transpose so that OTUs (taxa) are rows.
+  return(otu_table(t(as.matrix(rawtab)), taxa_are_rows=TRUE))
+}
+################################################################################
+#' Import mothur constaxonomy file and return a taxonomyTable
+#'
+#' @param mothur_constaxonomy_file (Required). A
+#' \href{http://www.mothur.org/wiki/Constaxonomy_file}{consensus taxonomy file}
+#' produced by \emph{mothur}.
+#'
+#' @param parseFunction (Optional). A specific function used for parsing the taxonomy string.
+#' See \code{\link{parse_taxonomy_default}} for an example. If the default is
+#' used, this function expects a semi-colon delimited taxonomy string, with
+#' no additional rank specifier. A common taxonomic database is GreenGenes,
+#' and for recent versions its taxonomy includes a prefix, which is best cleaved
+#' and used to precisely label the ranks (\code{\link{parse_taxonomy_greengenes}}).
+#'
+#' @return A \code{\link{taxonomyTable-class}} object.
+#'
+#' @seealso \code{\link{import_mothur}}
+#'
+#' \code{\link{tax_table}}
+#'
+#' \code{\link{phyloseq}}
+#'
+#' @keywords internal
+import_mothur_constaxonomy <- function(mothur_constaxonomy_file, parseFunction=parse_taxonomy_default){
+  # Read the file once (it was previously also read a first time with the
+  # result thrown away). The first column supplies the row names; only the
+  # "Taxonomy" column is kept.
+  rawtab <- read.table(mothur_constaxonomy_file, header=TRUE, row.names=1, stringsAsFactors=FALSE)[, "Taxonomy", drop=FALSE]
+  taxonomy <- rawtab[, "Taxonomy"]
+  if( identical(parseFunction, parse_taxonomy_default) ){
+    # Default parsing: clean the string and split it into ranks first.
+    # Remove the confidence strings inside the parentheses, if present
+    taxonomy <- gsub("\\([[:digit:]]+\\)", "", taxonomy)
+    # Remove the quotation marks, if present
+    taxonomy <- gsub("\"", "", taxonomy)
+    # Remove trailing semicolon
+    taxonomy <- gsub(";$", "", taxonomy)
+    # Split on semicolon, then parse each vector of ranks
+    taxlist <- lapply(strsplit(taxonomy, ";", fixed=TRUE), parseFunction)
+  } else {
+    # A custom parser receives the raw, unsplit taxonomy string of each row.
+    taxlist <- lapply(taxonomy, parseFunction)
+  }
+  names(taxlist) <- rownames(rawtab)
+  return(build_tax_table(taxlist))
+}
+################################################################################
+#' General function for importing mothur data files into phyloseq.
+#'
+#' Technically all parameters are optional,
+#' but if you don't provide any file connections, then nothing will be returned.
+#' While the \code{list} and \code{group} files are the first two arguments
+#' for legacy-compatibility reasons, we don't recommend that you use these
+#' file types with modern (large) datasets. They are comically inefficient, as
+#' they store the name of every sequencing read in both files. The \emph{mothur}
+#' package provides conversion utilities to create other more-efficient formats,
+#' which we recommend, like
+#' the \href{http://www.mothur.org/wiki/Shared_file}{shared file} for an OTU table.
+#' Alternatively, mothur also provides a utility to create a biom-format file
+#' that is independent of OTU clustering platform. Biom-format files
+#' should be imported not with this function, but with \code{\link{import_biom}}.
+#' The resulting objects after import should be \code{\link{identical}} in R.
+#'
+#' @param mothur_list_file (Optional). The list file name / location produced by \emph{mothur}.
+#'
+#' @param mothur_group_file (Optional). The name/location of the group file produced
+#' by \emph{mothur}'s \code{make.group()} function. It contains information
+#' about the sample source of individual sequences, necessary for creating a
+#' species/taxa abundance table (\code{otu_table}). See
+#' \code{http://www.mothur.org/wiki/Make.group}
+#'
+#' @param mothur_tree_file (Optional).
+#' A tree file, presumably produced by \emph{mothur},
+#' and readable by \code{\link{read_tree}}.
+#' The file probably has extension \code{".tree"}.
+#'
+#' @param cutoff (Optional). A character string indicating the cutoff value, (or \code{"unique"}),
+#' that matches one of the cutoff-values used to produce the OTU clustering
+#' results contained within the list-file created by \emph{mothur} (and specified
+#' by the \code{mothur_list_file} argument). The default
+#' is to take the largest value among the cutoff values contained in the list
+#' file. If only one cutoff is included in the file, it is taken and this
+#' argument does not need to be specified. Note that the \code{cluster()}
+#' function within the \emph{mothur} package will often produce a list file
+#' with multiple cutoff values, even if a specific cutoff is specified. It is
+#' suggested that you check which cutoff values are available in a given list
+#' file using the \code{\link{show_mothur_cutoffs}} function.
+#'
+#' @param mothur_shared_file (Optional). A
+#' \href{http://www.mothur.org/wiki/Shared_file}{shared file}
+#' produced by \emph{mothur}.
+#'
+#' @param mothur_constaxonomy_file (Optional). A
+#' \href{http://www.mothur.org/wiki/Constaxonomy_file}{consensus taxonomy file}
+#' produced by \emph{mothur}.
+#'
+#' @param parseFunction (Optional). A specific function used for parsing the taxonomy string.
+#' See \code{\link{parse_taxonomy_default}} for an example. If the default is
+#' used, this function expects a semi-colon delimited taxonomy string, with
+#' no additional rank specifier. A common taxonomic database is GreenGenes,
+#' and in recent versions its taxonomy entries include a prefix, which is best cleaved
+#' and used to precisely label the ranks (\code{\link{parse_taxonomy_greengenes}}).
+#'
+#' @return The object class depends on the provided arguments.
+#' A phyloseq object is returned if enough data types are provided.
+#' If only one data component can be created from the data, it is returned.
+#'
+#' FASTER (recommended for larger data sizes):
+#'
+#' If only a \code{mothur_constaxonomy_file} is provided,
+#' then a \code{\link{taxonomyTable-class}} object is returned.
+#'
+#' If only a \code{mothur_shared_file} is provided,
+#' then an \code{\link{otu_table}} object is returned.
+#'
+#' SLOWER (but fine for small file sizes):
+#'
+#' The list and group file formats are extremely inefficient for large datasets,
+#' and they are not recommended. The mothur software provides tools for
+#' converting to other file formats, such as a so-called ``shared'' file.
+#' You should provide a shared file, or group/list files, but not
+#' both at the same time.
+#' If only a list and group file are provided,
+#' then an \code{otu_table} object is returned.
+#' Similarly, if only a list and tree file are provided,
+#' then only a tree is returned (\code{\link[ape]{phylo}}-class).
+#'
+#' @references \url{http://www.mothur.org/wiki/Main_Page}
+#'
+#' Schloss, P.D., et al., Introducing mothur: Open-source, platform-independent,
+#' community-supported software for describing and comparing microbial communities.
+#' Appl Environ Microbiol, 2009. 75(23):7537-41.
+#'
+#' @export
+#'
+#' @examples
+#' # # The following example assumes you have downloaded the esophagus example
+#' # # dataset from the mothur wiki:
+#' # # "http://www.mothur.org/wiki/Esophageal_community_analysis"
+#' # # "http://www.mothur.org/w/images/5/55/Esophagus.zip"
+#' # # The path on your machine may (probably will) vary
+#' # mothur_list_file <- "~/Downloads/mothur/Esophagus/esophagus.an.list"
+#' # mothur_group_file <- "~/Downloads/mothur/Esophagus/esophagus.good.groups"
+#' # mothur_tree_file <- "~/Downloads/mothur/Esophagus/esophagus.tree"
+#' # # # Actual examples follow:
+#' # show_mothur_cutoffs(mothur_list_file)
+#' # test1 <- import_mothur(mothur_list_file, mothur_group_file, mothur_tree_file)
+#' # test2 <- import_mothur(mothur_list_file, mothur_group_file, mothur_tree_file, cutoff="0.02")
+#' # # Returns just a tree
+#' # import_mothur(mothur_list_file, mothur_tree_file=mothur_tree_file)
+#' # # Returns just an otu_table
+#' # import_mothur(mothur_list_file, mothur_group_file=mothur_group_file)
+#' # # Returns an error
+#' # import_mothur(mothur_list_file)
+#' # # Should return an "OMG, you must provide the list file" error
+#' # import_mothur()
+import_mothur <- function(mothur_list_file=NULL, mothur_group_file=NULL,
+  mothur_tree_file=NULL, cutoff=NULL,
+  mothur_shared_file=NULL, mothur_constaxonomy_file=NULL, parseFunction=parse_taxonomy_default){
+  # Import whichever components the supplied files allow, collect them in a
+  # list, and hand that list to phyloseq() to combine.
+  components <- list()
+
+  # Scalar `&&`: both operands are single TRUE/FALSE values from is.null().
+  if( !is.null(mothur_group_file) && !is.null(mothur_list_file) ){
+    # If list & group files provided, you can make an OTU table.
+    groupOTU <- import_mothur_otu_table(mothur_list_file, mothur_group_file, cutoff)
+    components <- c(components, list(groupOTU))
+  }
+
+  if( !is.null(mothur_tree_file) ){
+    tree <- read_tree(mothur_tree_file)
+    components <- c(components, list(tree))
+  }
+
+  if( !is.null(mothur_shared_file) ){
+    # NOTE(review): `cutoff` is not forwarded here, so the shared-file import
+    # always uses import_mothur_shared()'s default cutoff selection. Confirm
+    # whether that is intended.
+    OTUshared <- import_mothur_shared(mothur_shared_file)
+    components <- c(components, list(OTUshared))
+  }
+
+  if( !is.null(mothur_constaxonomy_file) ){
+    tax <- import_mothur_constaxonomy(mothur_constaxonomy_file, parseFunction)
+    components <- c(components, list(tax))
+  }
+
+  return(do.call("phyloseq", components))
+}
+################################################################################
+#' Import mothur-formatted distance file
+#'
+#' The mothur application will produce a file containing the pairwise distances
+#' between all sequences in a dataset. This distance matrix can be the basis for
+#' OTU cluster designations. R also has many built-in or off-the-shelf tools for
+#' dealing with distance matrices.
+#'
+#' @usage import_mothur_dist(mothur_dist_file)
+#'
+#' @param mothur_dist_file Required. The distance file name / location produced by \emph{mothur}.
+#'
+#' @return A distance matrix object describing all sequences in a dataset.
+#'
+#' @export
+#'
+#' @seealso \code{\link{import_mothur}}
+#'
+#' @examples
+#' # # Take a look at the dataset shown here as an example:
+#' # # "http://www.mothur.org/wiki/Esophageal_community_analysis"
+#' # # find the file ending with extension ".dist", download to your system
+#' # # The location of your file may vary
+#' # mothur_dist_file <- "~/Downloads/mothur/Esophagus/esophagus.dist"
+#' # myNewDistObject <- import_mothur_dist(mothur_dist_file)
+import_mothur_dist <- function(mothur_dist_file){
+  # Read a distance file in which every row holds a sequence name followed
+  # by that sequence's distances to the sequences listed before it, and
+  # return the distances as a "dist" object labelled by sequence name.
+
+  # Read the raw distance matrix file produced by mothur:
+  raw_dist_lines <- readLines(mothur_dist_file)
+  # The first line is a header, not a distance row, and is discarded
+  # (presumably the number of sequences -- confirm against the mothur format).
+  raw_dist_lines <- raw_dist_lines[-1]
+  # Trim surrounding white space and skip blank lines, so that e.g. a trailing
+  # empty line does not become an extra, NA-named sequence.
+  raw_dist_lines <- gsub("^[[:space:]]+|[[:space:]]+$", "", raw_dist_lines)
+  raw_dist_lines <- raw_dist_lines[nzchar(raw_dist_lines)]
+
+  # split each line on white space: first field is the name, rest are distances
+  fields <- strsplit(raw_dist_lines, "[[:space:]]+")
+  seq_names <- vapply(fields, function(f){f[1]}, character(1))
+  # lapply (not sapply) so the result is a list whatever the row lengths are
+  dist_values <- lapply(fields, function(f){as(f[-1], "numeric")})
+
+  # Initialize and fill the matrix, row by row from column 1.
+  distm <- matrix(0, nrow=length(fields), ncol=length(fields))
+  rownames(distm) <- seq_names; colnames(distm) <- seq_names
+  for( i in seq_along(dist_values) ){
+    vals <- dist_values[[i]]
+    # Rows without distances (such as the first sequence) are left at zero.
+    if( length(vals) > 0 ){
+      distm[i, seq_along(vals)] <- vals
+    }
+  }
+  # as.dist() reads only the lower triangle, so the diagonal is left at 0.
+  return(as.dist(distm))
+}
+################################################################################
+################################################################################
+#' Export a distance object as \code{.names} and \code{.dist} files for mothur
+#'
+#' The purpose of this function is to allow a user to easily export a distance object
+#' as a pair of files that can be immediately imported by mothur for OTU clustering
+#' and related analysis. A distance object can be created in \code{R} in a number of
+#' ways, including via cataloguing the cophenetic distances of a tree object.
+#'
+#' @usage export_mothur_dist(x, out=NULL, makeTrivialNamesFile=NULL)
+#'
+#' @param x (Required). A \code{"dist"} object, or a symmetric matrix.
+#'
+#' @param out (Optional). The desired output filename for the \code{.dist} file, OR
+#' left \code{NULL}, the default, in which case the mothur-formatted distance table
+#' is returned to \code{R} instead of being written to a file.
+#'
+#' @param makeTrivialNamesFile (Optional). Default \code{NULL}. The desired name of the \code{.names} file.
+#' If left \code{NULL}, no \code{.names} file is written.
+#'
+#' @return If \code{out} is \code{NULL}, a character matrix with one row per
+#' pair of objects and three columns: \code{i} and \code{j} (the names of the
+#' two objects) and \code{d} (their distance). Otherwise the table is written
+#' to \code{out} as tab-delimited text without a header, and \code{NULL} is
+#' returned invisibly.
+#'
+#' @export
+#'
+#' @examples #
+#' data(esophagus)
+#' myDistObject <- as.dist(ape::cophenetic.phylo(phy_tree(esophagus)))
+#' export_mothur_dist(myDistObject)
+export_mothur_dist <- function(x, out=NULL, makeTrivialNamesFile=NULL){
+  # is.matrix()/inherits() instead of class(x) == "...": class() of a matrix
+  # has length two in R >= 4.0, which is not a valid `if` condition.
+  if( is.matrix(x) ){ x <- as.dist(x) }
+  if( !inherits(x, "dist") ){ stop("x must be a dist object, or symm matrix") }
+
+  # Convert to a full matrix for indexing.
+  x <- as.matrix(x)
+  n <- nrow(x)
+
+  # Enumerate the lower triangle row by row:
+  # (2,1), (3,1), (3,2), (4,1), ... This is empty for fewer than two objects.
+  n_before <- seq_len(max(n - 1L, 0L))
+  i_idx <- rep(seq_len(n)[-1], times=n_before)
+  j_idx <- sequence(n_before)
+
+  # One row per unique pair: the two names and the distance (as character).
+  distdf <- cbind(
+    i=rownames(x)[i_idx],
+    j=colnames(x)[j_idx],
+    d=as.character(x[cbind(i_idx, j_idx)])
+  )
+
+  # mothur requires a .names file, in case you removed identical sequences
+  # from within mothur and need to keep track and add them back.
+  # The trivial version lists every name twice (two identical columns).
+  if( !is.null(makeTrivialNamesFile) ){
+    namestab <- matrix(rownames(x), nrow=length(rownames(x)), ncol=2)
+    write.table(namestab, file=makeTrivialNamesFile, quote=FALSE, sep="\t", row.names=FALSE, col.names=FALSE)
+  }
+
+  # If is.null(out)==TRUE, then return the three-column table.
+  if( is.null(out) ){
+    return(distdf)
+  }
+  # Otherwise write.table-it
+  write.table(distdf, file=out, quote=FALSE, sep="\t", row.names=FALSE, col.names=FALSE)
+  invisible(NULL)
+}
+################################################################################
+#' Export environment (ENV) file for UniFrac Server.
+#'
+#' Creates the environment table that is needed for the original UniFrac
+#' algorithm. Useful for cross-checking, or if want to use UniFrac server.
+#' Optionally the ENV-formatted table can be returned to the \code{R}
+#' workspace, and the tree component can be exported as Nexus format
+#' (Recommended).
+#'
+#' @param physeq (Required). Experiment-level (\code{\link{phyloseq-class}}) object.
+#' Ideally this also contains the phylogenetic tree, which is also exported by default.
+#'
+#' @param file (Optional). The file path for export. If not-provided, the
+#' expectation is that you will want to set \code{return} to \code{TRUE},
+#' and manipulate the ENV table on your own. Default is \code{""}, skipping
+#' the ENV file from being written to a file.
+#'
+#' @param writeTree (Optional). Write the phylogenetic tree as well as
+#' the ENV table. Default is \code{TRUE}.
+#'
+#' @param return (Optional). Should the ENV table be returned to the R workspace?
+#' Default is \code{FALSE}.
+#'
+#' @importFrom ape write.nexus
+#' @export
+#'
+#' @examples
+#' # # Load example data
+#' # data(esophagus)
+#' # export_env_file(esophagus, "~/Desktop/esophagus.txt")
+export_env_file <- function(physeq, file="", writeTree=TRUE, return=FALSE){
+  # Build the three-column ENV table (taxon, sample, abundance), with one
+  # row for every cell of the OTU table that is greater than zero.
+  # Optionally write it to `file`, write the tree next to it, and/or return it.
+  # data(esophagus)
+  # physeq <- esophagus
+
+  # Create otu_table matrix and force orientation (taxa as rows)
+  OTU <- as(otu_table(physeq), "matrix")
+  if( !taxa_are_rows(physeq) ){ OTU <- t(OTU) }
+
+  # initialize sequence/sample names
+  seqs <- taxa_names(physeq)
+  samples <- sample_names(physeq)
+
+  # Address rows by taxon name, in the order of `seqs`.
+  OTU <- OTU[seqs, , drop=FALSE]
+
+  # Locate all positive cells at once. The same `> 0` test defines both the
+  # number of rows and their content, and a table with exactly one positive
+  # cell is filled like any other.
+  hits <- which(OTU > 0, arr.ind=TRUE)
+  # Order by taxon, then by sample within taxon.
+  hits <- hits[order(hits[, 1], hits[, 2]), , drop=FALSE]
+
+  # Character matrix: taxon name, sample name, abundance.
+  ENV <- matrix(c(seqs[hits[, 1]], samples[hits[, 2]], as.character(OTU[hits])), ncol=3)
+
+  # If a file path is provided, write the table to that file
+  if(file != ""){
+    write.table(ENV, file=file, quote=FALSE, sep="\t", row.names=FALSE, col.names=FALSE)
+  }
+
+  # If needed, also write the associated tree-file: the ENV path plus ".nex".
+  # With the default file="" this path is just ".nex".
+  if( writeTree ){
+    fileTree <- paste0(file, ".nex")
+    write.nexus(phy_tree(physeq), file=fileTree, original.data=FALSE)
+  }
+
+  # If return argument is TRUE, return the environment table
+  if(return){ return(ENV) }
+}
+################################################################################
+# UniFrac ENV files have the form:
+#
+# SEQ1 ENV1 1
+# SEQ1 ENV2 2
+# SEQ2 ENV1 15
+# SEQ3 ENV1 2
+# SEQ4 ENV2 8
+# SEQ5 ENV1 4
+# http://128.138.212.43/unifrac/help.psp#env_file
+################################################################################
+#' Import phyloseq data from biom-format file
+#'
+#' New versions of QIIME produce a more-comprehensive and formally-defined
+#' JSON file format, called biom file format:
+#'
+#' ``The biom file format (canonically pronounced `biome') is designed to be a
+#' general-use format for representing counts of observations in one or
+#' more biological samples. BIOM is a recognized standard for the Earth Microbiome
+#' Project and is a Genomics Standards Consortium candidate project.''
+#'
+#' \url{http://biom-format.org/}
+#'
+#' @usage import_biom(BIOMfilename,
+#' treefilename=NULL, refseqfilename=NULL, refseqFunction=readDNAStringSet, refseqArgs=NULL,
+#' parseFunction=parse_taxonomy_default, parallel=FALSE, version=1.0, ...)
+#'
+#' @param BIOMfilename (Required). A character string indicating the
+#' file location of the BIOM formatted file. This is a JSON formatted file,
+#' specific to biological datasets, as described in
+#' \href{http://www.qiime.org/svn_documentation/documentation/biom_format.html}{the biom-format home page}.
+#' In principle, this file should include your OTU abundance data (OTU table),
+#' your taxonomic classification data (taxonomy table), as well as your
+#' sample data, for instance what might be in your ``sample map'' in QIIME.
+#' A phylogenetic tree is not yet supported by biom-format, and so is a
+#' separate argument here. If, for some reason, your biom-format file is
+#' missing one of these mentioned data types but you have it in a separate file,
+#' you can first import the data that is in the biom file using this function,
+#' \code{import_biom}, and then ``merge'' the remaining data after you have
+#' imported with other tools using the relatively general-purpose data
+#' merging function called \code{\link{merge_phyloseq}}.
+#'
+#' @param treefilename (Optional). Default value is \code{NULL}.
+#' A file representing a phylogenetic tree
+#' or a \code{\link{phylo}} object.
+#' Files can be NEXUS or Newick format.
+#' See \code{\link{read_tree}} for more details.
+#' Also, if using a recent release of the GreenGenes database tree,
+#' try the \code{\link{read_tree_greengenes}} function --
+#' this should solve some issues specific to importing that tree.
+#' If provided, the tree should have the same OTUs/tip-labels
+#' as the OTUs in the other files.
+#' Any taxa or samples missing in one of the files is removed from all.
+#' As an example from the QIIME pipeline,
+#' this tree would be a tree of the representative 16S rRNA sequences from each OTU
+#' cluster, with the number of leaves/tips equal to the number of taxa/species/OTUs,
+#' or the complete reference database tree that contains the OTU identifiers
+#' of every OTU in your abundance table.
+#' Note that this argument can be a tree object (\code{\link[ape]{phylo}}-class)
+#' for cases where the tree has been --- or needs to be --- imported separately,
+#' as in the case of the GreenGenes tree mentioned earlier (\code{\link{read_tree_greengenes}}).
+#'
+#' @param refseqfilename (Optional). Default \code{NULL}.
+#' The file path of the biological sequence file that contains at a minimum
+#' a sequence for each OTU in the dataset.
+#' Alternatively, you may provide an already-imported
+#' \code{\link[Biostrings]{XStringSet}} object that satisfies this condition.
+#' In either case, the \code{\link{names}} of each OTU need to match exactly the
+#' \code{\link{taxa_names}} of the other components of your data.
+#' If this is not the case, for example if the data file is a FASTA format but
+#' contains additional information after the OTU name in each sequence header,
+#' then some additional parsing is necessary,
+#' which you can either perform separately before calling this function,
+#' or describe explicitly in a custom function provided in the (next) argument,
+#' \code{refseqFunction}.
+#' Note that the \code{\link[Biostrings]{XStringSet}} class can represent any
+#' arbitrary sequence, including user-defined subclasses, but is most-often
+#' used to represent RNA, DNA, or amino acid sequences.
+#' The only constraint is that this special list of sequences
+#' has exactly one named element for each OTU in the dataset.
+#'
+#' @param refseqFunction (Optional).
+#' Default is \code{\link[Biostrings]{readDNAStringSet}},
+#' which expects to read a fasta-formatted DNA sequence file.
+#' If your reference sequences for each OTU are amino acid, RNA, or something else,
+#' then you will need to specify a different function here.
+#' This is the function used to read the file connection provided as
+#' the previous argument, \code{refseqfilename}.
+#' This argument is ignored if \code{refseqfilename} is already a
+#' \code{\link[Biostrings]{XStringSet}} class.
+#'
+#' @param refseqArgs (Optional).
+#' Default \code{NULL}.
+#' Additional arguments to \code{refseqFunction}.
+#' See \code{\link[Biostrings]{XStringSet-io}} for details about
+#' additional arguments to the standard read functions in the Biostrings package.
+#'
+#' @param parseFunction (Optional). A function. It must be a function that
+#' takes as its first argument a character vector of taxonomic rank labels
+#' for a single OTU
+#' and parses and names each element
+#' (and optionally removes unwanted elements).
+#' Further details and examples of acceptable functions are provided
+#' in the documentation for \code{\link{parse_taxonomy_default}}.
+#' There are many variations on taxonomic nomenclature, and naming
+#' conventions used to store that information in various taxonomic
+#' databases and phylogenetic assignment algorithms. A popular database,
+#' \href{http://greengenes.lbl.gov/cgi-bin/nph-index.cgi}{greengenes},
+#' has its own custom parsing function provided in the phyloseq package,
+#' \code{\link{parse_taxonomy_greengenes}},
+#' and more can be contributed or posted as code snippets as needed.
+#' They can be custom-defined by a user immediately prior to the call to
+#' \code{\link{import_biom}}, and this is a suggested first step to take
+#' when trouble-shooting taxonomy-related errors during file import.
+#'
+#' @param parallel (Optional). Logical. Wrapper option for \code{.parallel}
+#' parameter in \code{plyr-package} functions. If \code{TRUE}, apply
+#' parsing functions in parallel, using parallel backend provided by
+#' \code{\link{foreach}} and its supporting backend packages. One caveat,
+#' plyr-parallelization currently works most-cleanly with \code{multicore}-like
+#' backends (Mac OS X, Unix?), and may throw warnings for SNOW-like backends.
+#' See the example below for code invoking multicore-style backend within
+#' the \code{doParallel} package.
+#'
+#' Finally, for many datasets a parallel import should not be necessary
+#' because a serial import will be just as fast and the import is often only
+#' performed one time; after which the data should be saved as an RData file
+#' using the \code{\link{save}} function.
+#'
+#' @param version (Optional). Numeric. The expected version number of the file.
+#' As the BIOM format evolves, version-specific importers may be available
+#' by adjusting the version value. Default is \code{1.0}.
+#' Not yet implemented. Parsing of the biom-format is done mostly
+#' by the biom package now available in CRAN.
+#'
+#' @param ... Additional parameters passed on to \code{\link{read_tree}}.
+#'
+#' @return A \code{\link{phyloseq-class}} object.
+#'
+#' @seealso
+#' \code{\link{import}}
+#'
+#' \code{\link{import_qiime}}
+#'
+#' \code{\link{read_tree}}
+#'
+#' \code{\link{read_tree_greengenes}}
+#'
+#' \code{\link[biomformat]{read_biom}}
+#'
+#' \code{\link[biomformat]{biom_data}}
+#'
+#' \code{\link[biomformat]{sample_metadata}}
+#'
+#' \code{\link[biomformat]{observation_metadata}}
+#'
+#' \code{\link[Biostrings]{XStringSet-io}}
+#'
+#' @references \href{http://www.qiime.org/svn_documentation/documentation/biom_format.html}{biom-format}
+#'
+#' @importFrom Biostrings readDNAStringSet
+#' @importFrom biomformat read_biom
+#' @importFrom biomformat sample_metadata
+#' @importFrom biomformat biom_data
+#' @importFrom biomformat observation_metadata
+#'
+#' @export
+#'
+#' @examples
+#' # An included example of a rich dense biom file
+#' rich_dense_biom <- system.file("extdata", "rich_dense_otu_table.biom", package="phyloseq")
+#' import_biom(rich_dense_biom, parseFunction=parse_taxonomy_greengenes)
+#' # An included example of a rich sparse biom file
+#' rich_sparse_biom <- system.file("extdata", "rich_sparse_otu_table.biom", package="phyloseq")
+#' import_biom(rich_sparse_biom, parseFunction=parse_taxonomy_greengenes)
+#' # # # Example code for importing large file with parallel backend
+#' # library("doParallel")
+#' # registerDoParallel(cores=6)
+#' # import_biom("my/file/path/file.biom", parseFunction=parse_taxonomy_greengenes, parallel=TRUE)
+import_biom <- function(BIOMfilename, 
+  treefilename=NULL, refseqfilename=NULL, refseqFunction=readDNAStringSet, refseqArgs=NULL,
+  parseFunction=parse_taxonomy_default, parallel=FALSE, version=1.0, ...){
+  # Assemble a phyloseq object from a BIOM file path (or an already-parsed
+  # 'biom' object), optionally adding a tree and reference sequences.
+  # Components are collected in `argumentlist` and handed to phyloseq() at the end.
+  # Note: `parallel` and `version` are accepted but not used in this function body.
+
+  # initialize the argument-list for phyloseq. Start empty.
+  argumentlist <- list()
+
+  # Read the data.
+  # is.character()/inherits() are used instead of `class(x) == "..."` because
+  # class() may return several elements, which is not a valid `if()` condition.
+  if( is.character(BIOMfilename) ){
+    x = read_biom(biom_file=BIOMfilename)
+  } else if( inherits(BIOMfilename, "biom") ){
+    x = BIOMfilename
+  } else {
+    stop("import_biom requires a 'character' string to a biom file or a 'biom-class' object")
+  }
+
+  ########################################
+  # OTU table:
+  ########################################
+  otutab = otu_table(as(biom_data(x), "matrix"), taxa_are_rows=TRUE)
+  argumentlist <- c(argumentlist, list(otutab))
+
+  ########################################
+  # Taxonomy Table
+  ########################################
+  # Need to check if taxonomy information is empty (minimal BIOM file)
+  if( all( sapply(sapply(x$rows, function(i){i$metadata}), is.null) ) ){
+    taxtab <- NULL
+  } else {
+    # parse once each character vector, save as a list
+    taxlist = lapply(x$rows, function(i){
+      parseFunction(i$metadata$taxonomy)
+    })
+    names(taxlist) = sapply(x$rows, function(i){i$id})
+    taxtab = build_tax_table(taxlist)
+  }
+  # c() with list(NULL) still appends an element, so a NULL taxtab is passed on as-is.
+  argumentlist <- c(argumentlist, list(taxtab))
+
+  ########################################
+  # Sample Data ("columns" in QIIME/BIOM)
+  ########################################
+  # If there is no metadata (all NULL), then set sam_data <- NULL
+  if( is.null(sample_metadata(x)) ){
+    samdata <- NULL
+  } else {
+    samdata = sample_data(sample_metadata(x))
+  }
+  argumentlist <- c(argumentlist, list(samdata))
+
+  ########################################
+  # Tree data
+  ########################################
+  if( !is.null(treefilename) ){
+    if( inherits(treefilename, "phylo") ){
+      # If argument is already a tree, don't read, just assign.
+      tree = treefilename
+    } else {
+      # NULL is silently returned if tree is not read properly.
+      tree <- read_tree(treefilename, ...)
+    }
+    # Add to argument list or warn
+    if( is.null(tree) ){
+      warning("treefilename failed import. It not included.")
+    } else {
+      argumentlist <- c(argumentlist, list(tree) )
+    }
+  }
+
+  ########################################
+  # Reference Sequence data
+  ########################################
+  if( !is.null(refseqfilename) ){
+    if( inherits(refseqfilename, "XStringSet") ){
+      # If argument is already a XStringSet, don't read, just assign.
+      refseq = refseqfilename
+    } else {
+      # call refseqFunction and read refseqfilename, either with or without additional args.
+      # The function object itself is passed to do.call(), so the call does not
+      # depend on looking up the name "refseqFunction" in the calling frame.
+      if( !is.null(refseqArgs) ){
+        refseq = do.call(refseqFunction, c(list(refseqfilename), refseqArgs))
+      } else {
+        refseq = refseqFunction(refseqfilename)
+      }
+    }
+    argumentlist <- c(argumentlist, list(refseq) )
+  }
+
+  ########################################
+  # Put together into a phyloseq object
+  ########################################
+  return( do.call("phyloseq", argumentlist) )
+
+}
+################################################################################
+# Need to export these parsing functions as examples...
+################################################################################
+#' Parse elements of a taxonomy vector
+#'
+#' These are provided as both example and default functions for
+#' parsing a character vector of taxonomic rank information for a single taxa.
+#' As default functions, these are intended for cases where the data adheres to
+#' the naming convention used by greengenes
+#' (\url{http://greengenes.lbl.gov/cgi-bin/nph-index.cgi})
+#' or where the convention is unknown, respectively.
+#' To work, these functions -- and any similar custom function you may want to
+#' create and use -- must take as input a single character vector of taxonomic
+#' ranks for a single OTU, and return a \strong{named} character vector that has
+#' been modified appropriately (according to known naming conventions,
+#' desired length limits, etc.).
+#' The length (number of elements) of the output named vector does \strong{not}
+#' need to be equal to the input, which is useful for the cases where the
+#' source data files have extra meaningless elements that should probably be
+#' removed, like the ubiquitous
+#' ``Root'' element often found in greengenes/QIIME taxonomy labels.
+#' In the case of \code{parse_taxonomy_default}, no naming convention is assumed and
+#' so dummy rank names are added to the vector.
+#' More usefully if your taxonomy data is based on greengenes, the
+#' \code{parse_taxonomy_greengenes} function clips the first 3 characters that
+#' identify the rank, and uses these to name the corresponding element according
+#' to the appropriate taxonomic rank name used by greengenes
+#' (e.g. \code{"p__"} at the beginning of an element means that element is
+#' the name of the phylum to which this OTU belongs).
+#' Most importantly, the expectations for these functions described above
+#' make them compatible to use during data import,
+#' specifically the \code{\link{import_biom}} function, but
+#' it is a flexible structure that will be implemented soon for all phyloseq
+#' import functions that deal with taxonomy (e.g. \code{\link{import_qiime}}).
+#'
+#' @usage parse_taxonomy_default(char.vec)
+#' @usage parse_taxonomy_greengenes(char.vec)
+#' @usage parse_taxonomy_qiime(char.vec)
+#'
+#' @param char.vec (Required). A single character vector of taxonomic
+#' ranks for a single OTU, unprocessed (ugly).
+#'
+#' @return A character vector in which each element is a different
+#' taxonomic rank of the same OTU, and each element name is the name of
+#' the rank level. For example, an element might be \code{"Firmicutes"}
+#' and named \code{"phylum"}.
+#' These parsed, named versions of the taxonomic vector should
+#' reflect embedded information, naming conventions,
+#' desired length limits, etc; or in the case of \code{\link{parse_taxonomy_default}},
+#' not modified at all and given dummy rank names to each element.
+#'
+#' @rdname parseTaxonomy-functions
+#' @export
+#'
+#' @seealso
+#' \code{\link{import_biom}}
+#' \code{\link{import_qiime}}
+#'
+#' @examples
+#' taxvec1 = c("Root", "k__Bacteria", "p__Firmicutes", "c__Bacilli", "o__Bacillales", "f__Staphylococcaceae")
+#' parse_taxonomy_default(taxvec1)
+#' parse_taxonomy_greengenes(taxvec1)
+#' taxvec2 = c("Root;k__Bacteria;p__Firmicutes;c__Bacilli;o__Bacillales;f__Staphylococcaceae")
+#' parse_taxonomy_qiime(taxvec2)
+parse_taxonomy_default = function(char.vec){
+  # Strip leading, then trailing, whitespace from every element.
+  trimmed <- sub("^[[:space:]]+", "", char.vec)
+  trimmed <- sub("[[:space:]]+$", "", trimmed)
+  # Nothing to name: warn and hand back the (empty) vector unchanged.
+  if( length(trimmed) == 0 ){
+    warning("Empty taxonomy vector encountered.")
+    return(trimmed)
+  }
+  # Label the elements with dummy rank names: Rank1, Rank2, ...
+  names(trimmed) <- paste0("Rank", seq_along(trimmed))
+  return(trimmed)
+}
+#' @rdname parseTaxonomy-functions
+#' @aliases parse_taxonomy_default
+#' @export
+parse_taxonomy_greengenes <- function(char.vec){
+  # Parse one OTU's taxonomy vector that uses greengenes-style rank prefixes
+  # (e.g. "p__Firmicutes"): the prefix is clipped from the start of each
+  # element and used to name that element by its rank. Elements without a
+  # recognized prefix keep the dummy "RankN" name given by parse_taxonomy_default().
+  #
+  # Use default to assign names to elements in case problem with greengenes prefix
+  char.vec = parse_taxonomy_default(char.vec)
+  # Define the meaning of each prefix according to GreenGenes taxonomy
+  Tranks = c(k="Kingdom", p="Phylum", c="Class", o="Order", f="Family", g="Genus", s="Species")
+  # The prefix is one letter plus two underscores at the START of an element.
+  # Anchoring with "^" keeps look-alike substrings inside a name
+  # (e.g. "Clostridium_sp__X") from being detected or stripped.
+  prefixRegex = "^[[:alpha:]]__"
+  # Check for prefix using regexp, warn if there were none. trim indices, ti
+  ti = grep(prefixRegex, char.vec)
+  if( length(ti) == 0L ){
+    warning(
+      "No greengenes prefixes were found. \n",
+      "Consider using parse_taxonomy_default() instead if true for all OTUs. \n",
+      "Dummy ranks may be included among taxonomic ranks now."
+    )
+    # Will want to return without further modifying char.vec
+    taxvec = char.vec
+  # Replace names of taxvec according to prefix, if any present...
+  } else {
+    # Remove the leading prefix only, call result taxvec
+    taxvec = sub(prefixRegex, "", char.vec)
+    # Define the ranks that will be replaced (NA for letters not in Tranks)
+    repranks = Tranks[substr(char.vec[ti], 1, 1)]
+    # Replace, being sure to avoid prefixes not present in Tranks
+    names(taxvec)[ti[!is.na(repranks)]] = repranks[!is.na(repranks)]
+  }
+  return(taxvec)
+}
+#' @rdname parseTaxonomy-functions
+#' @aliases parse_taxonomy_default
+#' @export
+parse_taxonomy_qiime <- function(char.vec){
+  # Split the first element of `char.vec` on literal semicolons, then
+  # hand the resulting rank strings to the greengenes parser.
+  rankStrings <- strsplit(char.vec, split=";", fixed=TRUE)[[1]]
+  parse_taxonomy_greengenes(rankStrings)
+}
+################################################################################
+#' Build a \code{\link{tax_table}} from a named possibly-jagged list
+#'
+#' @param taxlist (Required). A list in which each element is a vector of
+#' taxonomic assignments named by rank.
+#' Every element of every vector must be named by the rank it represents.
+#' Every element of the list (every vector) should correspond to a single OTU
+#' and be named for that OTU.
+#'
+#' @return A \code{\link{tax_table}} (\code{\link{taxonomyTable-class}}) that
+#' has been built from \code{taxlist}. The OTU names of this output will be
+#' the element names of \code{taxlist}, and a separate taxonomic rank
+#' (column) will be included for each unique rank found among the element names
+#' of each vector in the list. \code{NA_character_} is the default value of
+#' elements in the \code{\link{tax_table}} for which there is no corresponding
+#' information in \code{taxlist}.
+#'
+#' @seealso
+#' \code{\link{import_biom}}
+#' \code{\link{import_qiime}}
+#'
+#' @export
+#'
+#' @examples
+#' taxvec1 = c("Root", "k__Bacteria", "p__Firmicutes", "c__Bacilli", "o__Bacillales", "f__Staphylococcaceae")
+#' parse_taxonomy_default(taxvec1)
+#' parse_taxonomy_greengenes(taxvec1)
+#' taxvec2 = c("Root;k__Bacteria;p__Firmicutes;c__Bacilli;o__Bacillales;f__Staphylococcaceae")
+#' parse_taxonomy_qiime(taxvec2)
+#' taxlist1 = list(OTU1=parse_taxonomy_greengenes(taxvec1), OTU2=parse_taxonomy_qiime(taxvec2))
+#' taxlist2 = list(OTU1=parse_taxonomy_default(taxvec1), OTU2=parse_taxonomy_qiime(taxvec2))
+#' build_tax_table(taxlist1)
+#' build_tax_table(taxlist2)
+build_tax_table = function(taxlist){
+  # Build a character matrix with one row per element of `taxlist` and one
+  # column per distinct rank name found across its vectors, then pass it to
+  # tax_table(). Cells with no corresponding entry stay NA_character_.
+  #
+  # Determine column headers (rank names) of taxonomy table
+  columns = unique(unlist(lapply(taxlist, names)))
+  # Initialize taxonomic character matrix
+  taxmat <- matrix(NA_character_, nrow=length(taxlist), ncol=length(columns))
+  colnames(taxmat) = columns
+  # Fill in the matrix by row.
+  # seq_along() rather than 1:length(): the latter yields c(1, 0) for an
+  # empty list and would index elements that do not exist.
+  for( i in seq_along(taxlist) ){
+    # Protect against empty taxonomy
+    if( length(taxlist[[i]]) > 0 ){
+      # Assigning by column name solves issues with raggedness, and disorder.
+      taxmat[i, names(taxlist[[i]])] <- taxlist[[i]]
+    }
+  }
+  # Convert functionally empty elements, "", to NA
+  taxmat[taxmat==""] <- NA_character_
+  # Name the rows as "id" (the taxa name), coerce to taxonomyTable
+  rownames(taxmat) = names(taxlist)
+  return( tax_table(taxmat) )
+}
+################################################################################
+################################################################################
+################################################################################
+#' Import microbio.me/qiime (QIIME-DB) data package
+#'
+#' Originally, this function was for accessing microbiome datasets from the
+#' \href{http://www.microbio.me/qiime/index.psp}{microbio.me/qiime}
+#' public repository from within R.
+#' As you can see by clicking on the above link,
+#' the QIIME-DB server is down indefinitely.
+#' However, this function will remain supported here
+#' in case the FTP server goes back up,
+#' and also for phyloseq users that have downloaded
+#' one or more data packages prior to the server going down.
+#'
+#' @param zipftp (Required). A character string that is the full URL
+#' path to a zipped file that follows the file naming conventions used by
+#' \href{http://www.microbio.me/qiime/index.psp}{microbio.me/qiime}.
+#' Alternatively, you can simply provide the study number
+#' as a single \code{\link{integer}} or other single-length vector
+#' that can be \code{\link{coerce}}d to an integer;
+#' this function will complete the remainder of the ftp URL hosted at
+#' \href{http://www.microbio.me/qiime/index.psp}{microbio.me/qiime}.
+#' For example, instead of the full URL string,
+#' \code{"ftp://thebeast.colorado.edu/pub/QIIME_DB_Public_Studies/study_494_split_library_seqs_and_mapping.zip"},
+#' you could simply provide \code{494} or \code{"494"}
+#' as the first (`zipftp`) argument.
+#'
+#' @param ext (Optional). A \code{\link{character}} string of the expected
+#' file extension, which also indicates the compression type,
+#' if \code{zipftp} is a study number instead of the full path.
+#' Note that this argument has no effect if \code{zipftp} is the full path,
+#' in which case the file extension is read directly from the downloaded file.
+#'
+#' @param parsef (Optional). The type of taxonomic parsing to use for the
+#' OTU taxonomic classification, in the \code{.biom} file, if present.
+#' This is passed on to \code{\link{import_biom}}, but unlike that function
+#' the default parsing function is \code{\link{parse_taxonomy_greengenes}},
+#' rather than \code{\link{parse_taxonomy_default}}, because we know
+#' ahead of time that most (or all?) of the taxonomic classifications
+#' in the \code{microbio.me/qiime} repository will be based on
+#' GreenGenes.
+#'
+#' @param ... (Optional, for advanced users). Additional arguments passed to
+#' \code{\link{download.file}}. This is mainly for non-standard links to
+#' resources (in this case, a zipped file) that are not being hosted by
+#' \href{http://www.microbio.me/qiime/index.psp}{microbio.me/qiime}.
+#' If you are using a FTP address or study number from their servers,
+#' then you shouldn't need to provide any additional arguments.
+#'
+#' @return
+#' A \code{\link{phyloseq-class}} object if possible, a component if only a
+#' component could be imported, or \code{NULL} if nothing could be imported
+#' after unzipping the file. Keep in mind there is a specific naming-convention
+#' that is expected based on the current state of the
+#' \href{http://www.microbio.me/qiime/index.psp}{microbio.me/qiime}
+#' servers. Several helpful messages are \code{\link{cat}}ted to standard out
+#' to help let you know the ongoing status of the current
+#' download and import process.
+#'
+#' @seealso
+#' See \code{\link{download.file}} and \code{\link{url}}
+#' for details about URL formats --
+#' including local file addresses -- that might work here.
+#'
+#' \code{\link{import_biom}}
+#'
+#' \code{\link{import_qiime}}
+#'
+#' @export
+#' @examples
+#' # This should return TRUE on your system if you have internet turned on
+#' # and a standard R installation. Indicates whether this is likely to
+#' # work on your system for a URL or local file, respectively.
+#' capabilities("http/ftp"); capabilities("fifo")
+#' # A working example with a local example file included in phyloseq
+#' zipfile = "study_816_split_library_seqs_and_mapping.zip"
+#' zipfile = system.file("extdata", zipfile, package="phyloseq")
+#' tarfile = "study_816_split_library_seqs_and_mapping.tar.gz"
+#' tarfile = system.file("extdata", tarfile, package="phyloseq")
+#' tarps = microbio_me_qiime(tarfile)
+#' zipps = microbio_me_qiime(zipfile)
+#' identical(tarps, zipps)
+#' tarps; zipps
+#' plot_heatmap(tarps)
+#' # An example that used to work, before the QIIME-DB server was turned off by its host.
+#' # # Smokers dataset
+#' # smokezip = "ftp://thebeast.colorado.edu/pub/QIIME_DB_Public_Studies/study_524_split_library_seqs_and_mapping.zip"
+#' # smokers1 = microbio_me_qiime(smokezip)
+#' # # Alternatively, just use the study number
+#' # smokers2 = microbio_me_qiime(524)
+#' # identical(smokers1, smokers2)
+microbio_me_qiime = function(zipftp, ext=".zip", parsef=parse_taxonomy_greengenes, ...){
+  # Download (if remote) and unpack a microbio.me/qiime-style archive, import
+  # the .biom file and the QIIME mapping file found inside, and return the
+  # merged result, a single component, or NULL if nothing was imported.
+  #
+  # Define naming convention
+  front = "ftp://thebeast.colorado.edu/pub/QIIME_DB_Public_Studies/study_"
+  # A URL or file path cannot be coerced to integer; the resulting
+  # "NAs introduced by coercion" warning is expected, so it is suppressed.
+  if( !is.na(suppressWarnings(as.integer(zipftp))) ){
+    # If study number instead of string, 
+    # create the ftp URL using ext and convention
+    back = paste0("_split_library_seqs_and_mapping", ext)
+    zipftp = paste0(front, zipftp, back)
+  } else {
+    # Determine file extension from the file path itself
+    # (only the last dot-suffix, so "x.tar.gz" gives ".gz")
+    ext = substring(zipftp, regexpr("\\.([[:alnum:]]+)$", zipftp)[1])
+  }
+  # Check if zipftp is clearly an externally located file, ftp, http, etc.
+  externprefixes = c("http://", "https://", "ftp://")
+  prefix = regexpr("^([[:alnum:]]+)\\://", zipftp)
+  if( substr(zipftp, 1, attr(prefix, "match.length")[1]) %in% externprefixes ){
+    # If external, then create temporary file and download
+    zipfile = tempfile()
+    download.file(zipftp, zipfile, ...)
+  } else {
+    # Else it is a local zipfile
+    zipfile = zipftp
+  }
+  # Use the apparent file naming convention for microbio.me/qiime
+  # as the de facto guide for this API. In particular,
+  # the expectation of the study name (already used above)
+  studyname = gsub("\\_split\\_.+$", "", basename(zipftp))
+  # The output of tempdir() is always the same in the same R session
+  # To avoid conflict with multiple microbio.me/qiime unpacks 
+  # in the same session, pre-pend the study name and datestamp
+  unpackdir = paste0(studyname, "_", gsub("[[:blank:][:punct:]]", "", date()))
+  # Add the temp path
+  unpackdir = file.path(tempdir(), unpackdir)
+  # Create the unpack directory if needed (most likely).
+  if( !file.exists(unpackdir) ){dir.create(unpackdir)}
+  # Unpack to the temporary directory using unzip or untar
+  if( ext == ".zip" ){
+    unzip(zipfile, exdir=unpackdir, overwrite=TRUE)
+  } else if( ext %in% c(".tar.gz", "tar.gz", ".tgz", ".gz", ".gzip", ".bzip2", ".xz") ){
+    # ".tar.gz" is accepted so that a user-supplied ext=".tar.gz" works;
+    # the dot-less "tar.gz" is kept for backward compatibility.
+    # untar the tarfile to the new temp dir
+    untar(zipfile, exdir=unpackdir)
+  } else {
+    # The compression format was not recognized. Provide informative error msg.
+    msg = paste("Could not determine the compression type.",
+                "Expected extensions are (mostly):",
+                ".zip, .tgz, .tar.gz", sep="\n")
+    stop(msg)
+  }
+  # Define a list of imported objects that might grow 
+  # if the right file types are present and imported correctly.
+  imported_objects = vector("list")
+  # Search recursively in the unpacked directory for the .biom file
+  # and parse if it is.
+  # There should be only one. Throw warning if more than one, take the first.
+  biomfile = list.files(unpackdir, "\\.biom", full.names=TRUE, recursive=TRUE)
+  if( length(biomfile) > 1 ){
+    warning("more than one .biom file found in compressed archive. Importing first only.")
+    biomfile = biomfile[1]
+  }
+  # Separate `if` (not `else if`) so the first file is really imported
+  # after the warning above.
+  if( length(biomfile) == 1 ){
+    cat("Found biom-format file, now parsing it... \n")
+    biom = import_biom(biomfile, parseFunction=parsef)
+    cat("Done parsing biom... \n")
+    imported_objects = c(imported_objects, list(biom))
+  }
+  # Check if sample_data (qiime mapping) file present, and parse if it is.
+  sdfile = list.files(unpackdir, "\\_mapping\\_file\\.txt", full.names=TRUE, recursive=TRUE)
+  if( length(sdfile) > 1 ){
+    warning("more than one mapping file found in compressed archive. Importing first only.")
+    sdfile = sdfile[1]
+  }
+  # Same pattern as for the biom file: import the (first) mapping file if any.
+  if( length(sdfile)==1 ){
+    cat("Importing Sample Metdadata from mapping file...", fill=TRUE)
+    sample_metadata = import_qiime_sample_data(sdfile)
+    imported_objects = c(imported_objects, list(sample_metadata))
+  }
+  # Check success, notify user, and return.
+  if( length(imported_objects) > 1 ){
+    # If there are more than one imported objects, merge them and return
+    cat("Merging the imported objects... \n")
+    physeq = do.call("merge_phyloseq", imported_objects)
+    if( inherits(physeq, "phyloseq") ){
+      cat("Successfully merged, phyloseq-class created. \n Returning... \n")
+    }
+    return(physeq)
+  } else if( length(imported_objects) == 1 ){
+    cat("Note: only on object in the zip file was imported. \n")
+    cat("It was ", class(imported_objects[[1]]), " class. \n")
+    return(imported_objects[[1]])
+  } else {
+    cat("PLEASE NOTE: No objects were imported. \n", 
+        "You chould check the zip file, \n",
+        "as well as the naming conventions in the zipfile \n",
+        "to make sure that they match microbio.me/qiime. \n",
+        "Instead returning NULL... \n")
+    return(NULL)
+  }
+}
+################################################################################
+#' Import usearch table format (\code{.uc}) to OTU table
+#'
+#' UPARSE is an algorithm for OTU-clustering implemented within usearch.
+#' At last check, the UPARSE algorithm was accessed via the
+#' \code{-cluster_otu} option flag.
+#' For details about installing and running usearch, please refer to the
+#' \href{http://drive5.com/usearch/}{usearch website}.
+#' For details about the output format, please refer to the
+#' \href{http://www.drive5.com/usearch/manual/opt_uc.html}{uc format definition}.
+#' This importer is intended to read a particular table format output
+#' that is generated by usearch,
+#' its so-called ``cluster format'',
+#' a file format that is often given the \code{.uc} extension
+#' in usearch documentation.
+#'
+#' Because usearch is an external (non-R) application, there is no direct
+#' way to continuously check that these suggested arguments and file formats will
+#' remain in their current state.
+#' If there is a problem, please verify your version of usearch,
+#' create a small reproducible example of the problem,
+#' and post it as an issue on the phyloseq issues tracker.
+#' The version of usearch upon which this import function
+#' was created is \code{7.0.109}.
+#' Hopefully later versions of usearch maintain this function and format,
+#' but the phyloseq team has no way to guarantee this,
+#' and so any feedback about this will help maintain future functionality.
+#' For instance, it is currently
+#' assumed that the 9th and 10th columns of the \code{.uc} table
+#' hold the read-label and OTU ID, respectively;
+#' and it is also assumed that the delimiter between sample-name and read
+#' in the read-name entries is a single \code{"_"}.
+#' If this is not true, you may have to update these parameters,
+#' or even modify the current implementation of this function.
+#'
+#' Also note that there is now a UPARSE-specific output file format,
+#' \href{http://www.drive5.com/usearch/manual/opt_uparseout.html}{uparseout},
+#' and it might make more sense to create and import that file
+#' for use in phyloseq.
+#' If so, you'll want to import using the
+#' \code{\link{import_uparse}()} function.
+#'
+#' @param ucfile (Required). A file location character string
+#' or \code{\link{connection}}
+#' corresponding to the file that contains the usearch output table.
+#' This is passed directly to \code{\link[data.table]{fread}}.
+#' Please see its \code{file} argument documentation for further
+#' links and details.
+#'
+#' @param colRead (Optional). Numeric. The column index in the uc-table
+#' file that holds the read IDs.
+#' The default column index is \code{9}.
+#'
+#' @param colOTU (Optional). Numeric. The column index in the uc-table
+#' file that holds OTU IDs.
+#' The default column index is \code{10}.
+#'
+#' @param readDelimiter (Optional). An R \code{\link{regex}} as a character string.
+#' This should be the delimiter that separates the sample ID
+#' from the original ID in the demultiplexed read ID of your sequence file.
+#' The default is plain underscore, which in this \code{\link{regex}} context
+#' is \code{"_"}.
+#'
+#' @param verbose (Optional). A \code{\link{logical}}.
+#' Default is \code{TRUE}.
+#' Should progress messages
+#' be \code{\link{cat}}ted to standard out?
+#'
+#' @importFrom data.table fread
+#' @importFrom data.table setnames
+#' @export
+#' @seealso \code{\link{import}}
+#'
+#' \code{\link{import_biom}}
+#'
+#' \code{\link{import_qiime}}
+#'
+#' @examples
+#' usearchfile <- system.file("extdata", "usearch.uc", package="phyloseq")
+#' import_usearch_uc(usearchfile)
+import_usearch_uc <- function(ucfile, colRead=9, colOTU=10,
+                              readDelimiter="_", verbose=TRUE){
+  # Read the read-label and OTU-ID columns of a usearch `.uc` table, drop
+  # entries with no OTU assignment, derive the sample name from each read
+  # label, and return a sample-by-OTU count table as an otu_table.
+  if(verbose){cat("Reading `ucfile` into memory and parsing into table \n")}
+  # fread is one of the fastest and most-efficient importers for R.
+  # It creates a data.table object, suitable for large size objects.
+  # showProgress follows `verbose`, so verbose=FALSE also silences fread's
+  # own progress output.
+  x = fread(ucfile, sep="\t", header=FALSE, na.strings=c("*", "NA","N/A",""),
+            select=c(colRead, colOTU), colClasses="character", showProgress=verbose)
+  setnames(x, c("read", "OTU"))
+  NrawEntries = nrow(x)
+  if(verbose){
+    cat("Initially read", NrawEntries, "entries. \n")
+    cat("... Now removing unassigned OTUs (* or NA)... \n")
+  }
+  x = x[!is.na(OTU), ]
+  if(verbose){
+    cat("Removed", NrawEntries - nrow(x), "entries that had no OTU assignment. \n")
+    cat("A total of", nrow(x), "will be assigned to the OTU table.\n")
+  }
+  # Process sequence label to be sample label only:
+  # everything from the first match of `readDelimiter` onward is removed.
+  x[, sample:=gsub(paste0(readDelimiter, ".+$"), "", read)]
+  # Convert long (melted) table into a sample-by-OTU OTU table, and return
+  OTU <- as(table(x$sample, x$OTU), "matrix")
+  # system.time({setkey(x, OTU, sample)
+  # OTU2 <- dcast.data.table(x, sample ~ OTU, fun.aggregate=length, fill=0L)
+  # })
+  return(otu_table(OTU, taxa_are_rows=FALSE))
+}
+################################################################################
+#' Import \href{http://www.drive5.com/usearch/manual/opt_uparseout.html}{UPARSE file format}
+#'
+#' UPARSE is an algorithm for OTU-clustering implemented within usearch.
+#' At last check, the UPARSE algorithm was accessed via the
+#' \code{-cluster_otu} option flag.
+#' For details about installing and running usearch, please refer to the
+#' \href{http://drive5.com/usearch/}{usearch website}.
+#' For details about the output format, please refer to the
+#' \href{http://www.drive5.com/usearch/manual/opt_uparseout.html}{uparse format definition}.
+#'
+#' Because UPARSE is an external (non-R) application, there is no direct
+#' way to continuously check that these suggested arguments and file formats will
+#' remain in their current state.
+#' If there is a problem, please verify your version of usearch,
+#' create a small reproducible example of the problem,
+#' and post it as an issue on the
+#' \href{https://github.com/joey711/phyloseq/issues}{phyloseq issues tracker}.
+#'
+#' @param upFile (Required). A file location character string
+#' or \code{\link{connection}}
+#' corresponding to the file that contains the UPARSE output table.
+#' This is passed directly to \code{\link[data.table]{fread}}.
+#' Please see its \code{file} argument documentation for further
+#' links and details.
+#'
+#' @param omitChimeras (Optional). \code{logical(1)}.
+#' Default is \code{TRUE}.
+#' Whether to omit entries that correspond to sequences/OTUs
+#' that were identified as chimeras.
+#'
+#' @param countTable (Optional). \code{logical(1)}.
+#' Default is \code{TRUE}.
+#' Whether to return the result as a wide-format table
+#' with dimensions OTU-by-sample,
+#' or to leave the table in its original sparse long-format
+#' that might be more suitable for certain \code{\link{data.table}} operations.
+#' If \code{TRUE}, entries corresponding to the same sample and OTU
+#' have their counts summed.
+#'
+#' @param OTUtable (Optional). \code{logical(1)}.
+#' Default is \code{TRUE}.
+#' Whether to coerce the result to \code{\link{otu_table}} format,
+#' or leave it as a \code{\link{data.table}} format.
+#' The former is appropriate for most \code{\link{phyloseq}} operations,
+#' the latter is useful for a lot of custom operations
+#' and custom \code{\link[ggplot2]{ggplot}2} graphics calls.
+#'
+#' @param verbose (Optional). A \code{\link{logical}}.
+#' Default is \code{TRUE}.
+#' Should progress messages
+#' be \code{\link{cat}}ted to standard out?
+#'
+#' @importFrom data.table fread
+#' @importFrom data.table setnames
+#' @importFrom data.table setkeyv
+#'
+#' @export
+#'
+#' @seealso
+#' \code{\link{import_usearch_uc}}
+#'
+#' @examples
+#' ###
+import_uparse = function(upFile, 
+                         omitChimeras = TRUE,
+                         countTable = TRUE,
+                         OTUtable = TRUE,
+                         verbose = TRUE){
+  # Read a UPARSE results table with fread, extract the count (";size=N;")
+  # and ID from the query label, optionally drop chimera entries, and return
+  # either the long table, a wide summed-count data.table, or an otu_table.
+  if(verbose){message("Parsing UPARSE results table at:\n", upFile, 
+                      "\nSee the following for details:\n",
+                      "http://www.drive5.com/usearch/manual/opt_uparseout.html")}
+  x = fread(upFile, header = FALSE)
+  # Pick exactly one column to be "OTULabel": the 6th (relabel) column when
+  # it is present, otherwise the 5th. Renaming both would leave two columns
+  # with the same name, and the relabel column would not be the one used.
+  labelColumn = if( ncol(x) > 5L ){"V6"} else {"V5"}
+  setnames(x, labelColumn, "OTULabel")
+  setnames(x, "V1", "queryString")
+  # The query label is expected to end in ";size=<digits>;"
+  x[, count := as.integer(gsub("^.+;size=(\\d+);$", "\\1", queryString))]
+  x[, queryID := gsub("^(.+);size=\\d+;$", "\\1", queryString)]
+  setnames(x, "V2", "Classification")
+  if(omitChimeras){
+    x <- x[(Classification != "chimera")]
+  }
+  if(countTable){
+    # If you want to create a wide-format table with summed counts
+    # key sort
+    sortVars = c("queryID", "OTULabel")
+    setkeyv(x, sortVars)
+    # turn into wide data.table: one row per OTULabel, one column per queryID
+    OTUwdt <- dcast.data.table(x, OTULabel ~ queryID, 
+                               value.var = "count",
+                               fun.aggregate = sum,
+                               fill=0L)
+    if(OTUtable){
+      # If we want an OTU table version of this
+      taxaIDvec = OTUwdt$OTULabel
+      OTUwdt[, OTULabel := NULL]
+      # Coerce to integer matrix
+      OTU <- as.matrix(OTUwdt)
+      row.names(OTU) <- taxaIDvec
+      # Coerce to OTU table and return
+      return(otu_table(OTU, taxa_are_rows=TRUE))
+    } else {
+      return(OTUwdt)
+    }
+  } else {
+    return(x[, list(OTULabel, count, queryID, Classification)])
+  }
+}
+################################################################################
+################################################################################
+################################################################################
diff --git a/R/allClasses.R b/R/allClasses.R
new file mode 100644
index 0000000..5eae197
--- /dev/null
+++ b/R/allClasses.R
@@ -0,0 +1,295 @@
+################################################################################
+#' The S4 class for storing taxa-abundance information.
+#'
+#' Because orientation of these tables can vary by method, the orientation is
+#' defined explicitly in the \code{taxa_are_rows} slot (a logical).
+#' The \code{otu_table} class inherits the \code{\link{matrix}} class to store
+#' abundance values.
+#' Various standard subset and assignment nomenclature has been extended to apply
+#' to the \code{otu_table} class, including square-bracket, \code{\link{t}}, etc.
+#'
+#' \describe{
+#' \item{taxa_are_rows}{
+#' A single logical specifying the orientation of the abundance table.
+#' }
+#'
+#' \item{.Data}{This slot is inherited from the \code{\link{matrix}} class.}
+#' }
+#' @name otu_table-class
+#' @rdname otu_table-class
+#' @exportClass otu_table
+setClass("otu_table", representation(taxa_are_rows="logical"), contains = "matrix")
+################################################################################
+#' The S4 class for storing sample variables.
+#'
+#' Row indices represent samples, while column indices represent experimental
+#' categories, variables (and so forth) that describe the samples.
+#'
+#' \describe{
+#'
+#' \item{.Data}{data-frame data, inherited from the data.frame class.}
+#'
+#' \item{row.names}{
+#' Also inherited from the data.frame class;
+#' it should contain the sample names.
+#' }
+#'
+#' \item{names}{Inherited from the data.frame class.}
+#'
+#' }
+#'
+#' @name sample_data-class
+#' @rdname sample_data-class
+#' @exportClass sample_data
+setClass("sample_data", contains="data.frame")
+################################################################################
+#' An S4 class that holds taxonomic classification data as a character
+#' matrix.
+#'
+#' Row indices represent taxa, columns represent taxonomic classifiers.
+#'
+#' \describe{
+#' \item{.Data}{This slot is inherited from the \code{\link{matrix}} class.}
+#' }
+#'
+#' @name taxonomyTable-class
+#' @rdname taxonomyTable-class
+#' @exportClass taxonomyTable
+setClass("taxonomyTable", contains = "matrix")
+#metaMDS
+################################################################################
+#' S3 class placeholder definition (list) for metaMDS
+#'
+#' The vegan package does not export a version of its \code{\link[vegan]{metaMDS}}-class,
+#' partly because it is not really defined formally anywhere.
+#' Instead, it is an S3 class extended from the base class, \code{\link{list}} --
+#' this is a very common and easy approach --
+#' and proper behavior of any method taking an instance of this class
+#' requires exact naming conventions for element names of the list components.
+#' The phyloseq package does not yet provide any validity checks that a given metaMDS
+#' instance is valid (conforms to the conventions in the vegan package).
+#' If problems arise, this might be considered, and they could be defined
+#' judiciously and within phyloseq.
+#'
+#' @seealso
+#' \code{\link[vegan]{metaMDS}}
+#'
+#' @keywords internal
+metaMDS <- structure(list(), class = "metaMDS")
+###
+# Remove if this ever works
+# @importClassesFrom vegan metaMDS
+################################################################################
+#' S3 class placeholder definition (list) for decorana
+#'
+#' The vegan package does not export a version of its \code{\link[vegan]{decorana}}-class,
+#' partly because it is not really defined formally anywhere.
+#' Instead, it is an S3 class extended from the base class, \code{\link{list}} --
+#' this is a very common and easy approach --
+#' and proper behavior of any method taking an instance of this class
+#' requires exact naming conventions for element names of the list components.
+#' The phyloseq package does not yet provide any validity checks that a given decorana
+#' instance is valid (conforms to the conventions in the vegan package).
+#' If problems arise, this might be considered, and they could be defined
+#' judiciously and within phyloseq.
+#'
+#' @seealso
+#' \code{\link[vegan]{decorana}}
+#'
+#' @keywords internal
+decorana <- structure(list(), class = "decorana")
+###
+# Remove if this ever works
+# @importClassesFrom vegan decorana
+################################################################################
+#' S3 class placeholder definition (list) for dpcoa
+#'
+#' The ade4 package does not export a version of its \code{\link[ade4]{dpcoa}}-class,
+#' partly because it is not really defined formally anywhere.
+#' Instead, it is an S3 class extended from the base class, \code{\link{list}} --
+#' this is a very common and easy approach --
+#' and proper behavior of any method taking an instance of this class
+#' requires exact naming conventions for element names of the list components.
+#' The phyloseq package does not yet provide any validity checks that a given dpcoa
+#' instance is valid (conforms to the conventions in the ade4 package).
+#' If problems arise, this might be considered, and they could be defined
+#' judiciously and within phyloseq.
+#'
+#' An instance of this class can be produced from within phyloseq using either
+#' the \code{\link{DPCoA}} function, or the higher-level wrapping function
+#' \code{\link{ordinate}}.
+#'
+#' @seealso
+#' \code{\link[ade4]{dpcoa}}
+#'
+#' \code{\link{DPCoA}}
+#'
+#' \code{\link{ordinate}}
+#'
+#' @keywords internal
+dpcoa <- structure(list(), class = "dpcoa")
+################################################################################
+## # @keywords internal
+## print.dpcoa <- ade4:::print.dpcoa
+################################################################################
+# If this ever works
+# @importClassesFrom ade4 dpcoa
+################################################################################
+#' S3 class for ape-calculated MDS results
+#'
+#' Nothing to import, because ape doesn't (yet) export this S3 class.
+#' We will define it here, but keep it internal.
+#' For the moment, its only use is for proper dispatch in our extensions
+#' to the scores S3 generic from vegan,
+#' for generic extraction of coordinates and possibly other features from
+#' any ordination results.
+#'
+#' @keywords internal
+pcoa <- structure(list(), class = "pcoa")
+# @importMethodsFrom ape print
+# phyloseq-specific definition of "phylo" class,
+################################################################################
+#' S3 class placeholder definition (list) for phylogenetic trees.
+#'
+#' The ape package does not export a version of its \code{\link[ape]{phylo}}-class,
+#' partly because it is not really defined formally anywhere.
+#' Instead, it is an S3 class extended from the base class, \code{\link{list}} --
+#' this is a very common and easy approach --
+#' and proper behavior of any method taking an instance of this class
+#' requires exact naming conventions for element names of the components.
+#' The phyloseq package does not yet provide any validity checks that a given phylo
+#' instance is valid (conforms to the conventions in the ape package).
+#' If problems arise, this might be considered, and they could be defined
+#' judiciously and within phyloseq.
+#' Similarly, if a formal definition for the phylo-class is ever exported
+#' by ape, the current philosophy of phyloseq would be to remove this
+#' internal definition and import the former. Note that there is still some
+#' work going on for the phylobase package, which is addressing these same
+#' exact issues for S4 phylogenetic tree interaction.
+#' A very large number of packages (around 60 at my last count), depend on ape,
+#' making it easily the de facto standard for representing phylogenetic trees in R;
+#' and the phyloseq team would prefer to use any exported definitions from
+#' the ape package if possible and available.
+#'
+#' @seealso
+#' \code{\link[ape]{phylo}}
+#'
+#' @keywords internal
+phylo <- structure(list(), class = "phylo")
+################################################################################
+# If this ever works
+# @importClassesFrom ape phylo
+################################################################################
+#' An S4 placeholder of the main phylogenetic tree class from the ape package.
+#'
+#' See the \code{\link[ape]{ape}} package for details about this type of
+#' representation of a phylogenetic tree.
+#' It is used throughout the ape package.
+#'
+#' @seealso \code{\link[ape]{phylo}}, \code{\link{setOldClass}}
+#'
+#' @name phylo-class
+#' @rdname phylo-class
+#' @exportClass phylo
+setOldClass("phylo")
+################################################################################
+#' An S4 placeholder for the \code{\link[stats]{dist}} class.
+#'
+#' See \code{\link[stats]{dist}} for details
+#' about this type of a distance matrix object.
+#'
+#' @seealso \code{\link[stats]{dist}}, \code{\link{setOldClass}}
+#'
+#' @name dist-class
+#' @rdname dist-class
+#' @exportClass dist
+setOldClass("dist")
+################################################################################
+# Use setClassUnion to define the unholy NULL-data union as a virtual class.
+# This is a way of dealing with the expected scenarios in which one or more of
+# the component data classes is not available, in which case NULL will be used
+# instead.
+################################################################################
+#' @keywords internal
+setClassUnion("otu_tableOrNULL", c("otu_table", "NULL"))
+#' @keywords internal
+setClassUnion("sample_dataOrNULL", c("sample_data", "NULL"))
+#' @keywords internal
+setClassUnion("taxonomyTableOrNULL", c("taxonomyTable", "NULL"))
+#' @keywords internal
+setClassUnion("phyloOrNULL", c("phylo", "NULL"))
+#' @importClassesFrom Biostrings BStringSet
+#' @importClassesFrom Biostrings DNAStringSet
+#' @importClassesFrom Biostrings RNAStringSet
+#' @importClassesFrom Biostrings AAStringSet
+#' @importClassesFrom Biostrings QualityScaledXStringSet
+#' @importClassesFrom Biostrings XStringQuality
+#' @importClassesFrom Biostrings PhredQuality
+#' @importClassesFrom Biostrings SolexaQuality
+#' @importClassesFrom Biostrings IlluminaQuality
+#' @importClassesFrom Biostrings QualityScaledBStringSet
+#' @importClassesFrom Biostrings QualityScaledDNAStringSet
+#' @importClassesFrom Biostrings QualityScaledRNAStringSet
+#' @importClassesFrom Biostrings QualityScaledAAStringSet
+#' @importClassesFrom Biostrings XStringSet
+#' @keywords internal
+setClassUnion("XStringSetOrNULL", c("XStringSet", "NULL"))
+################################################################################
+#' The main experiment-level class for phyloseq data
+#'
+#' Contains all currently-supported component data classes:
+#' \code{\link{otu_table-class}},
+#' \code{\link{sample_data-class}},
+#' \code{\link{taxonomyTable-class}} (\code{"tax_table"} slot),
+#' \code{\link[ape]{phylo}}-class (\code{"phy_tree"} slot),
+#' and the \code{\link[Biostrings]{XStringSet-class}} (\code{"refseq"} slot).
+#' There are several advantages
+#' to storing your phylogenetic sequencing experiment as an instance of the
+#' phyloseq class, not the least of which is that it is easy to return to the
+#' data later and feel confident that the different data types ``belong'' to
+#' one another. Furthermore, the \code{\link{phyloseq}} constructor ensures that
+#' the different data components have compatible indices (e.g. OTUs and samples),
+#' and performs the necessary trimming automatically when you create your
+#' ``experiment-level'' object. Downstream analyses are aware of which data
+#' classes they require -- and where to find them -- often making your
+#' \code{phyloseq-class} object the only data argument required for analysis and plotting
+#' functions (although there are many options and parameter arguments available
+#' to you).
+#'
+#' In the case of missing component data, the slots are set to \code{NULL}. As
+#' soon as a \code{phyloseq-class} object is to be updated with new component
+#' data (previously missing/\code{NULL} or not), the indices of all components
+#' are re-checked for compatibility and trimmed if necessary. This is to ensure
+#' by design that components describe the same taxa/samples, and also that these
+#' trimming/validity checks do not need to be repeated in downstream analyses.
+#'
+#' slots:
+#' \describe{
+#' \item{otu_table}{a single object of class otu_table.}
+#' \item{sam_data}{ a single object of class sample_data.}
+#' \item{tax_table}{ a single object of class taxonomyTable.}
+#' \item{phy_tree}{ a single object of the \code{\link[ape]{phylo}}-class, from the ape package.}
+#' \item{refseq}{ a biological sequence set object of a class that
+#' inherits from the \code{\link[Biostrings]{XStringSet-class}}, from the Biostrings package.}
+#' }
+#' @seealso
+#' The constructor, \code{\link{phyloseq}},
+#' the merger \code{\link{merge_phyloseq}}, and also the component
+#' constructor/accessors \code{\link{otu_table}}, \code{\link{sample_data}},
+#' \code{\link{tax_table}}, \code{\link{phy_tree}}, and \code{\link{refseq}}.
+#'
+#' @import BiocGenerics
+#' @importClassesFrom Biostrings XStringSet
+#' @name phyloseq-class
+#' @rdname phyloseq-class
+#' @exportClass phyloseq
+setClass(Class="phyloseq",
+ representation=representation(
+ otu_table="otu_tableOrNULL",
+ tax_table="taxonomyTableOrNULL",
+ sam_data="sample_dataOrNULL",
+ phy_tree="phyloOrNULL",
+ refseq = "XStringSetOrNULL"),
+ prototype=prototype(otu_table=NULL, tax_table=NULL, sam_data=NULL, phy_tree=NULL, refseq=NULL)
+)
+################################################################################
diff --git a/R/allData.R b/R/allData.R
new file mode 100644
index 0000000..d8bd2d3
--- /dev/null
+++ b/R/allData.R
@@ -0,0 +1,213 @@
+################################################################################
+#' (Data) Small example dataset from a human esophageal community (2004)
+#'
+#' Includes just 3 samples, 1 each from 3 subjects. Although the research article mentions 4 subjects,
+#' only 3 are included in this dataset.
+#'
+#' abstract from research article (quoted):
+#'
+#' The esophagus, like other luminal organs of the digestive system, provides a potential environment for bacterial colonization, but little is known about the presence of a bacterial biota or its nature. By using broad-range 16S rDNA PCR, biopsies were examined from the normal esophagus of four human adults. The 900 PCR products cloned represented 833 unique sequences belonging to 41 genera, or 95 species-level operational taxonomic units (SLOTU); 59 SLOTU were homologous with culture-d [...]
+#'
+#' (end quote)
+#'
+#' A description of the 16S rRNA sequence processing can be found on the mothur-wiki
+#' at the link below. A cutoff of 0.10 was used for OTU clustering in that example,
+#' and it is taken here as well to create example data, \code{esophagus}, which was
+#' easily imported with the \code{import_mothur()} function.
+#'
+#' @references
+#' Pei, Z., Bini, E. J., Yang, L., Zhou, M., Francois, F., & Blaser, M. J. (2004).
+#' Bacterial biota in the human distal esophagus.
+#' Proceedings of the National Academy of Sciences of the United States of America, 101(12), 4250-4255.
+#' \url{http://www.ncbi.nlm.nih.gov/pmc/articles/PMC384727}
+#'
+#' mothur-processed files and the sequence data can be downloaded from a zip-file,
+#' along with additional description, from the following URL:
+#' \url{http://www.mothur.org/wiki/Esophageal_community_analysis}
+#'
+#' @name data-esophagus
+#' @aliases esophagus
+#' @docType data
+#' @author Pei et al. \email{zhiheng.pei@@med.nyu.edu}
+#' @keywords data
+#' @examples
+#' data(esophagus)
+#' UniFrac(esophagus, weighted=TRUE)
+#' # How to re-create the esophagus dataset using import_mothur function
+#' mothlist <- system.file("extdata", "esophagus.fn.list.gz", package="phyloseq")
+#' mothgroup <- system.file("extdata", "esophagus.good.groups.gz", package="phyloseq")
+#' mothtree <- system.file("extdata", "esophagus.tree.gz", package="phyloseq")
+#' show_mothur_cutoffs(mothlist)
+#' cutoff <- "0.10"
+#' esophman <- import_mothur(mothlist, mothgroup, mothtree, cutoff)
+################################################################################
+NA
+################################################################################
+#' (Data) Enterotypes of the human gut microbiome (2011)
+#'
+#' Published in Nature in early 2011, this work compared (among other things),
+#' the faecal microbial communities from 22
+#' subjects using complete shotgun DNA sequencing.
+#' Authors further compared these microbial communities with the faecal
+#' communities of subjects from other studies. A total of 280 faecal samples / subjects
+#' are represented in this dataset, and 553 genera. The authors claim that the
+#' data naturally clumps into three community-level clusters, or ``enterotypes'',
+#' that are not immediately explained by sequencing technology or demographic
+#' features of the subjects, but with potential relevance to understanding
+#' human gut microbiota.
+#'
+#' abstract from research article (quoted):
+#'
+#' Our knowledge of species and functional composition of the human gut microbiome is rapidly increasing, but it is still based on very few cohorts and little is known about variation across the world. By combining 22 newly sequenced faecal metagenomes of individuals from four countries with previously published data sets, here we identify three robust clusters (referred to as enterotypes hereafter) that are not nation or continent specific. We also confirmed the enterotypes in two publi [...]
+#'
+#' (end quote)
+#'
+#' @references
+#' Arumugam, M., et al. (2011). Enterotypes of the human gut microbiome.
+#'
+#' Nature, 473(7346), 174-180.
+#'
+#' \url{http://www.nature.com/doifinder/10.1038/nature09944}
+#' See supplemental information for subject data.
+#'
+#' OTU-clustered data was downloaded from the publicly-accessible:
+#'
+#' \url{http://www.bork.embl.de/Docu/Arumugam_et_al_2011/downloads.html}
+#'
+#' @name data-enterotype
+#' @aliases enterotype
+#' @docType data
+#' @author Arumugam, M., Raes, J., et al.
+#' @keywords data
+#' @examples
+#' data(enterotype)
+#' ig <- make_network(enterotype, "samples", max.dist=0.3)
+#' plot_network(ig, enterotype, color="SeqTech", shape="Enterotype", line_weight=0.3, label=NULL)
+################################################################################
+NA
+################################################################################
+#' (Data) Reproducibility of soil microbiome data (2011)
+#'
+#' Published in early 2011,
+#' this work compared 24 separate soil microbial communities under four treatment
+#' conditions via multiplexed/barcoded 454-pyrosequencing of PCR-amplified 16S rRNA gene fragments.
+#' The authors found differences in the composition and structure of microbial
+#' communities between soil treatments.
+#' As expected, the soil microbial communities were highly diverse, with a staggering
+#' 16,825 different OTUs (species) observed in the included dataset.
+#' Interestingly, this study used a larger number of replicates than previous studies of this type,
+#' for a total of 56 samples, and the putatively low resampling rate of species
+#' between replicated sequencing trials (``OTU overlap'') was a major concern by
+#' the authors.
+#'
+#' This dataset contains an experiment-level (\code{\link{phyloseq-class}}) object,
+#' which in turn contains the taxa-contingency table and soil-treatment table
+#' as \code{\link{otu_table-class}} and \code{\link{sample_data-class}} components, respectively.
+#'
+#' This data was
+#' imported from raw files supplied directly by the authors via personal communication
+#' for the purposes of including as an example in the \code{\link{phyloseq-package}}.
+#' As this data is sensitive to choices in OTU-clustering parameters, attempts to recreate
+#' the \code{otu_table} from the raw sequencing data may give slightly different results
+#' than the table provided here.
+#'
+#' abstract from research article (quoted):
+#'
+#' To determine the reproducibility and quantitation of the amplicon sequencing-based
+#' detection approach for analyzing microbial community structure, a total of 24 microbial
+#' communities from a long-term global change experimental site were examined. Genomic DNA
+#' obtained from each community was used to amplify 16S rRNA genes with two or three
+#' barcode tags as technical replicates in the presence of a small quantity (0.1\% wt/wt)
+#' of genomic DNA from Shewanella oneidensis MR-1 as the control. The technical
+#' reproducibility of the amplicon sequencing-based detection approach is quite low,
+#' with an average operational taxonomic unit (OTU) overlap of 17.2\%\code{+/-}2.3\%
+#' between two technical replicates, and 8.2\%\code{+/-}2.3\% among three technical
+#' replicates, which is most likely due to problems associated with random sampling processes.
+#' Such variations in technical replicates could have substantial effects on estimating
+#' beta-diversity but less on alpha-diversity. A high variation was also observed in the
+#' control across different samples (for example, 66.7-fold for the forward primer),
+#' suggesting that the amplicon sequencing-based detection approach could not be quantitative.
+#' In addition, various strategies were examined to improve the comparability of amplicon
+#' sequencing data, such as increasing biological replicates, and removing singleton sequences
+#' and less-representative OTUs across biological replicates. Finally, as expected, various
+#' statistical analyses with preprocessed experimental data revealed clear differences in
+#' the composition and structure of microbial communities between warming and non-warming,
+#' or between clipping and non-clipping. Taken together, these results suggest that amplicon
+#' sequencing-based detection is useful in analyzing microbial community structure even
+#' though it is not reproducible and quantitative. However, great caution should be taken
+#' in experimental design and data interpretation when the amplicon sequencing-based detection
+#' approach is used for quantitative analysis of the beta-diversity of microbial communities.
+#'
+#' (end quote)
+#'
+#' @references Zhou, J., Wu, L., Deng, Y., Zhi, X., Jiang, Y.-H., Tu, Q., Xie, J., et al.
+#' Reproducibility and quantitation of amplicon sequencing-based detection.
+#' The ISME Journal. (2011) 5(8):1303-1313. \code{doi:10.1038/ismej.2011.11}
+#'
+#' The article can be accessed online at \url{http://www.nature.com/ismej/journal/v5/n8/full/ismej201111a.html}
+#'
+#' @name data-soilrep
+#' @aliases soilrep
+#' @docType data
+#' @author Jizhong Zhou, et al.
+#' @keywords data
+#' @examples
+#' # Load the data
+#' data(soilrep)
+#' ################################################################################
+#' # Alpha diversity (richness) example. Accept null hypothesis:
+#' # No convincing difference in species richness between warmed/unwarmed soils.
+#' ################################################################################
+#' # Graphically compare richness between the different treatments.
+#' man.col <- c(WC="red", WU="brown", UC="blue", UU="darkgreen")
+#' plot_richness(soilrep, x="Treatment", color="Treatment", measures=c("Observed", "Chao1", "Shannon"))
+################################################################################
+NA
+################################################################################
+################################################################################
+#' (Data) Global patterns of 16S rRNA diversity at a depth of millions of sequences per sample (2011)
+#'
+#' Published in PNAS in early 2011. This work compared the microbial
+#' communities from 25 environmental samples and three known ``mock communities''
+#' -- a total of 9 sample types -- at a depth averaging 3.1 million reads per sample.
+#' Authors were able to reproduce diversity patterns seen in many other
+#' published studies, while also investigating technical issues/bias by
+#' applying the same techniques to simulated microbial communities of known
+#' composition.
+#'
+#' abstract from research article (quoted):
+#'
+#' The ongoing revolution in high-throughput sequencing continues to democratize the ability of small groups of investigators to map the microbial component of the biosphere. In particular, the coevolution of new sequencing platforms and new software tools allows data acquisition and analysis on an unprecedented scale. Here we report the next stage in this coevolutionary arms race, using the Illumina GAIIx platform to sequence a diverse array of 25 environmental samples and three known ` [...]
+#'
+#' (end quote)
+#'
+#' Many thanks to J. Gregory Caporaso for directly providing the OTU-clustered data files
+#' for inclusion in this package.
+#'
+#' @references
+#' Caporaso, J. G., et al. (2011).
+#' Global patterns of 16S rRNA diversity at a depth of millions of sequences per sample.
+#' PNAS, 108, 4516-4522.
+#' PMCID: PMC3063599
+#'
+#' The primary article can be viewed/downloaded at:
+#' \url{http://www.pnas.org/content/108/suppl.1/4516.short}
+#'
+#' @name data-GlobalPatterns
+#' @aliases GlobalPatterns
+#' @docType data
+#' @author Caporaso, J. G., et al.
+#' @keywords data
+#'
+#' @seealso
+#' The examples on the phyloseq wiki page for \code{\link{plot_ordination}} show
+#' many more examples:
+#'
+#' \url{https://github.com/joey711/phyloseq/wiki/plot_ordination}
+#'
+#' @examples
+#' data(GlobalPatterns)
+#' plot_richness(GlobalPatterns, x="SampleType", measures=c("Observed", "Chao1", "Shannon"))
+################################################################################
+NA
+################################################################################
diff --git a/R/allPackage.R b/R/allPackage.R
new file mode 100644
index 0000000..9945db7
--- /dev/null
+++ b/R/allPackage.R
@@ -0,0 +1,24 @@
+###############################################
+#' Handling and analysis of high-throughput phylogenetic sequence data.
+#'
+#' There are already several ecology and phylogenetic packages available in R,
+#' including the adephylo, vegan, ade4, picante, ape, phangorn, phylobase, and OTUbase packages.
+#' These can already take advantage of many of the powerful statistical and graphics tools
+#' available in R. However, prior to \emph{phyloseq} a user must devise their own methods
+#' for parsing the output of their favorite OTU clustering application, and, as a consequence,
+#' there is also no standard within Bioconductor (or R generally) for storing or sharing the
+#' suite of related data objects that describe a phylogenetic sequencing project.
+#' The phyloseq package seeks to address these issues by providing a related set of S4 classes
+#' that internally manage the handling tasks associated with organizing, linking, storing,
+#' and analyzing phylogenetic sequencing data. \emph{phyloseq} additionally provides some
+#' convenience wrappers for input from common clustering applications, common analysis pipelines,
+#' and native implementation of methods that are not available in other R packages.
+#'
+#' @import methods
+#' @name phyloseq-package
+#' @author Paul J. McMurdie II \email{mcmurdie@@stanford.edu}
+#' @references \url{www.stanford.edu/~mcmurdie}
+#' @docType package
+#' @keywords package
+NA
+###############################################
diff --git a/R/almostAllAccessors.R b/R/almostAllAccessors.R
new file mode 100644
index 0000000..76c9709
--- /dev/null
+++ b/R/almostAllAccessors.R
@@ -0,0 +1,554 @@
+################################################################################
+### Accessor / subset methods.
+################################################################################
+################################################################################
+#' Retrieve reference sequences (\code{\link[Biostrings]{XStringSet}}-class) from object.
+#'
+#' This is the suggested method
+#' for accessing
+#' the reference sequences, (\code{\link[Biostrings]{XStringSet}}-class)
+#' from a phyloseq data object (\code{\link{phyloseq-class}}).
+#' Like other accessors (see See Also, below), the default behavior of this method
+#' is to stop with an
+#' error if \code{physeq} is a \code{phyloseq-class} but does not
+#' contain reference sequences (the component data type you are trying to access in this case).
+#'
+#' @usage refseq(physeq, errorIfNULL=TRUE)
+#'
+#' @param physeq (Required). An instance of phyloseq-class
+#' that contains reference sequences. If physeq is already an XStringSet
+#' of reference sequences (a component data class), then it is returned as-is.
+#'
+#' @param errorIfNULL (Optional). Logical. Should the accessor stop with
+#' an error if the slot is empty (\code{NULL})? Default \code{TRUE}.
+#'
+#' @return The \code{\link[Biostrings]{XStringSet}}-class object contained within \code{physeq};
+#' or NULL if \code{physeq} does not have reference sequences.
+#' This method stops with an error in the latter NULL case by default, which
+#' can be over-ridden by changing the value of \code{errorIfNULL} to \code{FALSE}.
+#'
+#' @seealso \code{\link{otu_table}}, \code{\link{sample_data}}, \code{\link{tax_table}}
+#' \code{\link{phy_tree}},
+#' \code{\link{phyloseq}}, \code{\link{merge_phyloseq}}
+#'
+#' @export
+#' @rdname refseq-methods
+#' @docType methods
+#'
+#' @examples
+#' data(GlobalPatterns)
+#' refseq(GlobalPatterns, FALSE)
+setGeneric("refseq", function(physeq, errorIfNULL=TRUE) standardGeneric("refseq"))
+#' @rdname refseq-methods
+#' @aliases refseq,ANY-method
+setMethod("refseq", "ANY", function(physeq, errorIfNULL=TRUE){
+ access(physeq, "refseq", errorIfNULL)
+})
+# Return as-is if already a "XStringSet" object
+#' @importClassesFrom Biostrings XStringSet
+#' @rdname refseq-methods
+#' @aliases refseq,XStringSet-method
+setMethod("refseq", "XStringSet", function(physeq){ return(physeq) })
+################################################################################
+#' Retrieve phylogenetic tree (\code{\link[ape]{phylo}}-class) from object.
+#'
+#' This is the suggested method
+#' for accessing
+#' the phylogenetic tree, (\code{\link[ape]{phylo}}-class) from a \code{\link{phyloseq-class}}.
+#' Like other accessors (see See Also, below), the default behavior of this method
+#' is to stop with an
+#' error if \code{physeq} is a \code{phyloseq-class} but does not
+#' contain a phylogenetic tree (the component data you are trying to access in this case).
+#'
+#' Note that the tip labels should be named to match the
+#' \code{taxa_names} of the other objects to which it is going to be paired.
+#' The \code{\link{phyloseq}} constructor automatically checks for
+#' exact agreement in the set of species described by the phylogenetic tree
+#' and the other components (taxonomyTable, otu_table),
+#' and trims as-needed. Thus, the tip.labels in a phylo object
+#' must be named to match the results of
+#' \code{\link{taxa_names}} of the other objects to which it will ultimately be paired.
+#'
+#' @usage phy_tree(physeq, errorIfNULL=TRUE)
+#'
+#' @param physeq (Required). An instance of phyloseq-class
+#' that contains a phylogenetic tree. If physeq is a phylogenetic
+#' tree (a component data class), then it is returned as-is.
+#'
+#' @param errorIfNULL (Optional). Logical. Should the accessor stop with
+#' an error if the slot is empty (\code{NULL})? Default \code{TRUE}.
+#'
+#' @return The \code{\link[ape]{phylo}}-class object contained within \code{physeq};
+#' or NULL if \code{physeq} does not have a tree.
+#' This method stops with an error in the latter NULL case by default, which
+#' can be over-ridden by changing the value of \code{errorIfNULL} to \code{FALSE}.
+#'
+#' @seealso \code{\link{otu_table}}, \code{\link{sample_data}}, \code{\link{tax_table}}
+#' \code{\link{refseq}},
+#' \code{\link{phyloseq}}, \code{\link{merge_phyloseq}}
+#'
+#' @export
+#' @rdname phy_tree-methods
+#' @docType methods
+#'
+#' @examples
+#' data(GlobalPatterns)
+#' phy_tree(GlobalPatterns)
+setGeneric("phy_tree", function(physeq, errorIfNULL=TRUE) standardGeneric("phy_tree"))
+#' @rdname phy_tree-methods
+#' @aliases phy_tree,ANY-method
+setMethod("phy_tree", "ANY", function(physeq, errorIfNULL=TRUE){
+ access(physeq, "phy_tree", errorIfNULL)
+})
+# Return as-is if already a "phylo" object
+#' @rdname phy_tree-methods
+#' @aliases phy_tree,phylo-method
+setMethod("phy_tree", "phylo", function(physeq){ return(physeq) })
+################################################################################
+#' Access taxa_are_rows slot from otu_table objects.
+#'
+#' @usage taxa_are_rows(physeq)
+#'
+#' @param physeq (Required). \code{\link{phyloseq-class}}, or \code{\link{otu_table-class}}.
+#'
+#' @return A logical indicating the orientation of the otu_table.
+#'
+#' @seealso \code{\link{otu_table}}
+#' @rdname taxa_are_rows-methods
+#' @docType methods
+#' @export
+#' @aliases taxa_are_rows taxa_are_rows
+setGeneric("taxa_are_rows", function(physeq) standardGeneric("taxa_are_rows"))
+#' @rdname taxa_are_rows-methods
+#' @aliases taxa_are_rows,ANY-method
+setMethod("taxa_are_rows", "ANY", function(physeq){NULL})
+#' @rdname taxa_are_rows-methods
+#' @aliases taxa_are_rows,otu_table-method
+setMethod("taxa_are_rows", "otu_table", function(physeq){physeq@taxa_are_rows}) # read the orientation slot
+#' @rdname taxa_are_rows-methods
+#' @aliases taxa_are_rows,phyloseq-method
+setMethod("taxa_are_rows", "phyloseq", function(physeq){
+ taxa_are_rows(otu_table(physeq))
+})
+################################################################################
+#' Get the number of taxa/species.
+#'
+#' @usage ntaxa(physeq)
+#'
+#' @param physeq \code{\link{phyloseq-class}}, \code{\link{otu_table-class}},
+#' \code{\link{taxonomyTable-class}}, or
+#' \code{\link[ape]{phylo}}
+#'
+#' @return An integer indicating the number of taxa / species.
+#'
+#' @seealso taxa_names
+#'
+#' @rdname ntaxa-methods
+#' @docType methods
+#' @export
+#'
+#' @examples
+#' data("esophagus")
+#' ntaxa(esophagus)
+#' phy_tree(esophagus)
+#' ntaxa(phy_tree(esophagus))
+setGeneric("ntaxa", function(physeq) standardGeneric("ntaxa"))
+#' @rdname ntaxa-methods
+#' @aliases ntaxa,ANY-method
+setMethod("ntaxa", "ANY", function(physeq){ return(NULL) })
+#' @rdname ntaxa-methods
+#' @aliases ntaxa,phyloseq-method
+setMethod("ntaxa", "phyloseq", function(physeq){
+ ntaxa(otu_table(physeq))
+})
+#' @rdname ntaxa-methods
+#' @aliases ntaxa,otu_table-method
+setMethod("ntaxa", "otu_table", function(physeq){
+ if( taxa_are_rows(physeq) ){
+ return( nrow(physeq) )
+ } else {
+ return( ncol(physeq) )
+ }
+})
+#' @rdname ntaxa-methods
+#' @aliases ntaxa,taxonomyTable-method
+setMethod("ntaxa", "taxonomyTable", function(physeq){
+ nrow(physeq)
+})
+#' @rdname ntaxa-methods
+#' @aliases ntaxa,phylo-method
+setMethod("ntaxa", "phylo", function(physeq){
+ length(physeq$tip.label)
+})
+#' @rdname ntaxa-methods
+#' @aliases ntaxa,XStringSet-method
+setMethod("ntaxa", "XStringSet", function(physeq){
+ length(physeq)
+})
+################################################################################
+#' Get species / taxa names.
+#'
+#' @usage taxa_names(physeq)
+#'
+#' @param physeq \code{\link{phyloseq-class}}, \code{\link{otu_table-class}},
+#' \code{\link{taxonomyTable-class}}, or
+#' \code{\link[ape]{phylo}}
+#'
+#' @return A character vector of the names of the species in \code{physeq}.
+#'
+#' @seealso ntaxa
+#'
+#' @rdname taxa_names-methods
+#' @docType methods
+#' @export
+#'
+#' @examples #
+#' data("esophagus")
+#' tree <- phy_tree(esophagus)
+#' OTU1 <- otu_table(esophagus)
+#' taxa_names(tree)
+#' taxa_names(OTU1)
+#' physeq1 <- phyloseq(OTU1, tree)
+#' taxa_names(physeq1)
+setGeneric("taxa_names", function(physeq) standardGeneric("taxa_names"))
+#' @rdname taxa_names-methods
+#' @aliases taxa_names,ANY-method
+setMethod("taxa_names", "ANY", function(physeq){ return(NULL) })
+#' @rdname taxa_names-methods
+#' @aliases taxa_names,phyloseq-method
+setMethod("taxa_names", "phyloseq", function(physeq){
+ taxa_names(otu_table(physeq))
+})
+#' @rdname taxa_names-methods
+#' @aliases taxa_names,otu_table-method
+setMethod("taxa_names", "otu_table", function(physeq){
+ if( taxa_are_rows(physeq) ){
+ return( rownames(physeq) )
+ } else {
+ return( colnames(physeq) )
+ }
+})
+#' @rdname taxa_names-methods
+#' @aliases taxa_names,taxonomyTable-method
+setMethod("taxa_names", "taxonomyTable", function(physeq) rownames(physeq) )
+#' @rdname taxa_names-methods
+#' @aliases taxa_names,sample_data-method
+setMethod("taxa_names", "sample_data", function(physeq) NULL )
+#' @rdname taxa_names-methods
+#' @aliases taxa_names,phylo-method
+setMethod("taxa_names", "phylo", function(physeq) physeq$tip.label )
+#' @rdname taxa_names-methods
+#' @aliases taxa_names,XStringSet-method
+setMethod("taxa_names", "XStringSet", function(physeq) names(physeq) )
+################################################################################
+#' Get the number of samples.
+#'
+#' @usage nsamples(physeq)
+#'
+#' @param physeq A \code{\link{phyloseq-class}}, \code{\link{sample_data}},
+#' or \code{\link{otu_table-class}}.
+#'
+#' @return An integer indicating the total number of samples.
+#'
+#' @seealso \code{\link{taxa_names}}, \code{\link{sample_names}},
+#' \code{\link{ntaxa}}
+#'
+#' @rdname nsamples-methods
+#' @docType methods
+#' @export
+#'
+#' @examples #
+#' data("esophagus")
+#' tree <- phy_tree(esophagus)
+#' OTU1 <- otu_table(esophagus)
+#' nsamples(OTU1)
+#' physeq1 <- phyloseq(OTU1, tree)
+#' nsamples(physeq1)
+setGeneric("nsamples", function(physeq) standardGeneric("nsamples"))
+#' @rdname nsamples-methods
+#' @aliases nsamples,ANY-method
+setMethod("nsamples", "ANY", function(physeq){ return(NULL) })
+#' @rdname nsamples-methods
+#' @aliases nsamples,phyloseq-method
+setMethod("nsamples", "phyloseq", function(physeq){
+ # dispatch to core, required component, otu_table
+ nsamples(otu_table(physeq))
+})
+#' @rdname nsamples-methods
+#' @aliases nsamples,otu_table-method
+setMethod("nsamples", "otu_table", function(physeq){
+ if( taxa_are_rows(physeq) ){
+ return( ncol(physeq) )
+ } else {
+ return( nrow(physeq) )
+ }
+})
+#' @rdname nsamples-methods
+#' @aliases nsamples,sample_data-method
+setMethod("nsamples", "sample_data", function(physeq) nrow(physeq) )
+################################################################################
+#' Get sample names.
+#'
+#' @usage sample_names(physeq)
+#'
+#' @param physeq (Required). A \code{\link{phyloseq-class}}, \code{\link{sample_data}},
+#' or \code{\link{otu_table-class}}.
+#'
+#' @return A character vector. The names of the samples in \code{physeq}.
+#'
+#' @seealso \code{\link{taxa_names}}, \code{\link{nsamples}}
+#'
+#' @aliases sample_names
+#'
+#' @rdname sample_names-methods
+#' @docType methods
+#' @export
+#'
+#' @examples
+#' data(esophagus)
+#' sample_names(esophagus)
+setGeneric("sample_names", function(physeq) standardGeneric("sample_names"))
+# Unless otherwise specified, this should return a value of NULL
+# That way, objects that do not explicitly describe samples all
+# behave in the same (returning NULL) way.
+#' @rdname sample_names-methods
+#' @aliases sample_names,ANY-method
+setMethod("sample_names", "ANY", function(physeq){ return(NULL) })
+#' @rdname sample_names-methods
+#' @aliases sample_names,phyloseq-method
+setMethod("sample_names", "phyloseq", function(physeq){
+ # dispatch to core, required component, otu_table
+ sample_names(otu_table(physeq))
+})
+#' @rdname sample_names-methods
+#' @aliases sample_names,sample_data-method
+setMethod("sample_names", "sample_data", function(physeq) rownames(physeq) )
+#' @rdname sample_names-methods
+#' @aliases sample_names,otu_table-method
+setMethod("sample_names", "otu_table", function(physeq){
+ if( taxa_are_rows(physeq) ){
+ return( colnames(physeq) )
+ } else {
+ return( rownames(physeq) )
+ }
+})
+################################################################################
+#' Returns all abundance values for species \code{i}.
+#'
+#' This is a simple accessor function for investigating
+#' a single species-of-interest.
+#'
+#' @usage get_sample(physeq, i)
+#' @param physeq (Required). \code{\link{otu_table-class}}, or \code{\link{phyloseq-class}}.
+#' @param i (Required). A single taxa/species/OTU ID for which you want
+#' to know the abundance in each sample.
+#'
+#' @return An integer vector of the abundance values for
+#' each sample in \code{physeq} for species \code{i}
+#'
+#' @seealso
+#' \code{\link{get_taxa}}
+#' \code{\link{taxa_names}}
+#' \code{\link{sample_names}}
+#'
+#' @rdname get_sample-methods
+#' @docType methods
+#' @export
+#'
+#' @examples
+#' data(esophagus)
+#' taxa_names(esophagus)
+#' get_sample(esophagus, "59_5_19")
+setGeneric("get_sample", function(physeq, i) standardGeneric("get_sample"))
+################################################################################
+#' @aliases get_sample,otu_table-method
+#' @rdname get_sample-methods
+setMethod("get_sample", "otu_table", function(physeq, i){
+ if( taxa_are_rows(physeq) ){
+ as(physeq, "matrix")[i, ]
+ } else {
+ as(physeq, "matrix")[, i]
+ }
+})
+################################################################################
+#' @aliases get_sample,phyloseq-method
+#' @rdname get_sample-methods
+setMethod("get_sample", "phyloseq", function(physeq, i){
+ get_sample(otu_table(physeq), i)
+})
+################################################################################
+#' Returns all abundance values of sample \code{i}.
+#'
+#' This is a simple accessor function for investigating
+#' a single sample-of-interest.
+#'
+#' @usage get_taxa(physeq, i)
+#' @param physeq (Required). \code{\link{otu_table-class}}, or \code{\link{phyloseq-class}}.
+#' @param i (Required). A single sample for which you want
+#' to know the abundance of each species. Can be integer
+#' for index value, or sample name.
+#'
+#' @return An integer vector of the abundance values for
+#' each species in \code{physeq} for sample \code{i}
+#'
+#' @seealso
+#' \code{\link{get_sample}}
+#' \code{\link{taxa_names}}
+#' \code{\link{sample_names}}
+#'
+#' @rdname get_taxa-methods
+#' @docType methods
+#' @export
+#'
+#' @examples
+#' data(esophagus)
+#' sample_names(esophagus)
+#' get_taxa(esophagus, "B")
+setGeneric("get_taxa", function(physeq, i) standardGeneric("get_taxa"))
+#' @aliases get_taxa,otu_table-method
+#' @rdname get_taxa-methods
+setMethod("get_taxa", "otu_table", function(physeq, i){
+ if( taxa_are_rows(physeq) ){
+ as(physeq, "matrix")[, i]
+ } else {
+ as(physeq, "matrix")[i, ]
+ }
+})
+#' @aliases get_taxa,phyloseq-method
+#' @rdname get_taxa-methods
+setMethod("get_taxa", "phyloseq", function(physeq, i){
+ get_taxa(otu_table(physeq), i)
+})
+################################################################################
+#' Retrieve the names of the taxonomic ranks
+#'
+#' This is a simple accessor function to make it more convenient to determine
+#' the taxonomic ranks that are available in a given \code{\link{phyloseq-class}}
+#' object.
+#'
+#' @usage rank_names(physeq, errorIfNULL=TRUE)
+#'
+#' @param physeq (Required).
+#' \code{\link{taxonomyTable-class}}, or \code{\link{phyloseq-class}}.
+#'
+#' @param errorIfNULL (Optional). Logical. Should the accessor stop with
+#' an error if the slot is empty (\code{NULL})? Default \code{TRUE}.
+#'
+#' @return Character vector. The names of the available taxonomic ranks.
+#'
+#' @seealso
+#' \code{\link{get_taxa}}
+#' \code{\link{taxa_names}}
+#' \code{\link{sample_names}}
+#'
+#' @export
+#'
+#' @examples
+#' data(enterotype)
+#' rank_names(enterotype)
+rank_names <- function(physeq, errorIfNULL=TRUE){
+ colnames(tax_table(physeq, errorIfNULL))
+}
+################################################################################
+#' Get a unique vector of the observed taxa at a particular taxonomic rank
+#'
+#' This is a simple accessor function to make it more convenient to determine
+#' the different taxa present for a particular taxonomic rank
+#' in a given \code{\link{phyloseq-class}} object.
+#'
+#' @usage get_taxa_unique(physeq, taxonomic.rank=rank_names(physeq)[1], errorIfNULL=TRUE)
+#'
+#' @param physeq (Required). \code{\link{taxonomyTable-class}}, or \code{\link{phyloseq-class}}.
+#'
+#' @param taxonomic.rank (Optional). Character. The taxonomic rank to use. Must select
+#' from the set indicated by \code{rank_names}. Default is
+#' to take the first column of the \code{taxonomyTable} component.
+#'
+#' @param errorIfNULL (Optional). Logical. Should the accessor stop with
+#' an error if the slot is empty (\code{NULL})? Default \code{TRUE}.
+#'
+#' @return Character vector. Unique vector of the observed taxa
+#' at a particular taxonomic rank
+#'
+#' @seealso
+#' \code{\link{get_taxa}}
+#' \code{\link{taxa_names}}
+#' \code{\link{sample_names}}
+#'
+#' @export
+#'
+#' @examples
+#' data(enterotype)
+#' get_taxa_unique(enterotype)
+#' data(GlobalPatterns)
+#' get_taxa_unique(GlobalPatterns, "Family")
+get_taxa_unique <- function(physeq, taxonomic.rank=rank_names(physeq)[1], errorIfNULL=TRUE){
+ unique(as(tax_table(physeq, errorIfNULL)[, taxonomic.rank], "character"))
+}
+################################################################################
+#' Get the sample variables present in sample_data
+#'
+#' This is a simple accessor function to make it more convenient to determine
+#' the sample variable names of a particular \code{\link{phyloseq-class}} object.
+#'
+#' @usage sample_variables(physeq, errorIfNULL=TRUE)
+#'
+#' @param physeq (Required). \code{\link{sample_data-class}}, or \code{\link{phyloseq-class}}.
+#'
+#' @param errorIfNULL (Optional). Logical. Should the accessor stop with
+#' an error if the slot is empty (\code{NULL})? Default \code{TRUE}.
+#'
+#' @return Character vector. The names of the variables in the sample_data
+#' data.frame. Essentially the column names. Useful for selecting model
+#' and graphics parameters that interact with sample_data.
+#'
+#' @seealso
+#' \code{\link{get_taxa}}
+#' \code{\link{taxa_names}}
+#' \code{\link{sample_names}}
+#'
+#' @export
+#'
+#' @examples
+#' data(enterotype)
+#' sample_variables(enterotype)
+sample_variables <- function(physeq, errorIfNULL=TRUE){
+ colnames(sample_data(physeq, errorIfNULL))
+}
+################################################################################
+#' Get the values for a particular variable in sample_data
+#'
+#' This is a simple accessor function for streamlining access
+#' to values/vectors/factors/etc contained in the sample_data.
+#'
+#' @usage get_variable(physeq, varName)
+#'
+#' @param physeq (Required). \code{\link{sample_data-class}}, or \code{\link{phyloseq-class}}.
+#'
+#' @param varName (Required). Character string of the variable name in \code{sample_data}.
+#' Use \code{sample_variables(physeq)} for available variables in your object.
+#'
+#' @return Data. The class of the data depends on the contents of sample_data.
+#'
+#' @seealso
+#' \code{\link{get_taxa}}
+#' \code{\link{taxa_names}}
+#' \code{\link{sample_names}}
+#'
+#' \code{\link{sample_variables}}
+#'
+#' @export
+#'
+#' @examples
+#' # Load the GlobalPatterns dataset into the workspace environment
+#' data(GlobalPatterns)
+#' # Look at the different values for SampleType
+#' get_variable(GlobalPatterns, "SampleType")
+get_variable <- function(physeq, varName){
+ if( is.null(sample_data(physeq, FALSE)) ){
+ stop("Your phyloseq data object does not have a sample-data component\n",
+ "Try ?sample_data for more details.")
+ }
+ return( as(sample_data(physeq), "data.frame")[, varName] )
+}
+################################################################################
diff --git a/R/as-methods.R b/R/as-methods.R
new file mode 100644
index 0000000..cb89511
--- /dev/null
+++ b/R/as-methods.R
@@ -0,0 +1,34 @@
+################################################################################
+# coercion methods
+################################################################################
+setAs("phyloseq", "matrix", function(from){
+ from@.Data # slot access uses `@`
+})
+setAs("phyloseq", "otu_table", function(from){
+ otu_table(from)
+})
+setAs("phyloseq", "otu_table", function(from){
+ otu_table(from)
+})
+################################################################################
+setAs("data.frame", "sample_data", function(from){
+ new("sample_data", from)
+})
+setAs("sample_data", "data.frame", function(from){
+ data.frame(from)
+})
+setAs("phyloseq", "sample_data", function(from){
+ sample_data(from)
+})
+################################################################################
+setAs("taxonomyTable", "matrix", function(from){
+ from@.Data # return the .Data slot of the table
+})
+setAs("phyloseq", "taxonomyTable", function(from){
+ tax_table(from)
+})
+################################################################################
+setAs("phyloseq", "phylo", function(from){
+ phy_tree(from)
+})
+################################################################################
diff --git a/R/assignment-methods.R b/R/assignment-methods.R
new file mode 100644
index 0000000..aa82c99
--- /dev/null
+++ b/R/assignment-methods.R
@@ -0,0 +1,375 @@
+################################################################################
+#' Assign a new OTU Table to \code{x}
+#'
+#' @usage otu_table(x) <- value
+#'
+#' @param x (Required). \code{\link{phyloseq-class}}
+#' @param value (Required).
+#' \code{\link{otu_table-class}}
+#' or
+#' \code{\link{phyloseq-class}}.
+#'
+#' @export
+#' @docType methods
+#' @rdname assign-otu_table
+#' @aliases assign-otu_table
+#'
+#' @examples
+#' # data(GlobalPatterns)
+#' # # An example of pruning to just the first 100 taxa in GlobalPatterns.
+#' # ex2a <- prune_taxa(taxa_names(GlobalPatterns)[1:100], GlobalPatterns)
+#' # # The following 3 lines produces an ex2b that is equal to ex2a
+#' # ex2b <- GlobalPatterns
+#' # OTU <- otu_table(GlobalPatterns)[1:100, ]
+#' # otu_table(ex2b) <- OTU
+#' # identical(ex2a, ex2b)
+#' # print(ex2b)
+#' # # Replace otu_table by implying the component in context.
+#' # ex2c <- GlobalPatterns
+#' # otu_table(ex2c) <- ex2b
+#' # identical(ex2a, ex2c)
+setGeneric("otu_table<-", function(x, value) standardGeneric("otu_table<-"))
+#' @rdname assign-otu_table
+#' @aliases otu_table<-,phyloseq,otu_table-method
+setMethod("otu_table<-", c("phyloseq", "otu_table"), function(x, value){
+ phyloseq(value, x@sam_data, x@tax_table, x@phy_tree, x@refseq) # rebuild with the new otu_table
+})
+#' @rdname assign-otu_table
+#' @aliases otu_table<-,otu_table,otu_table-method
+setMethod("otu_table<-", c("otu_table", "otu_table"), function(x, value){ value })
+#' @rdname assign-otu_table
+#' @aliases otu_table<-,phyloseq,phyloseq-method
+setMethod("otu_table<-", c("phyloseq", "phyloseq"), function(x, value){
+ phyloseq(otu_table(value), x@sam_data, x@tax_table, x@phy_tree, x@refseq) # take otu_table from value
+})
+################################################################################
+#' Manually change taxa_are_rows through assignment.
+#'
+#' The taxa_are_rows slot is a logical indicating the orientation of the
+#' abundance table contained in object \code{x}.
+#'
+#' @usage taxa_are_rows(x) <- value
+#'
+#' @param x \code{\link{otu_table-class}} or \code{\link{phyloseq-class}}
+#'
+#' @param value A logical of length equal to 1. If \code{length(value) > 1},
+#' the additional elements will be ignored. Only the first element is assigned
+#' to the taxa_are_rows slot.
+#'
+#' @export
+#' @docType methods
+#' @rdname assign-taxa_are_rows
+#' @aliases assign-taxa_are_rows taxa_are_rows<-
+#'
+#' @examples
+#' data(esophagus)
+#' taxa_are_rows(esophagus)
+#' taxa_are_rows(otu_table(esophagus))
+setGeneric("taxa_are_rows<-", function(x, value){
+ standardGeneric("taxa_are_rows<-")
+})
+#' @rdname assign-taxa_are_rows
+#' @aliases taxa_are_rows<-,otu_table,logical-method
+setMethod("taxa_are_rows<-", c("otu_table", "logical"), function(x, value){
+ x@taxa_are_rows <- value[1] # only the first element of value is used
+ return(x)
+})
+#' @rdname assign-taxa_are_rows
+#' @aliases taxa_are_rows<-,phyloseq,logical-method
+setMethod("taxa_are_rows<-", c("phyloseq", "logical"), function(x, value){
+ taxa_are_rows(otu_table(x)) <- value
+ return(x)
+})
+################################################################################
+#' Assign (new) sample_data to \code{x}
+#'
+#' This replaces the current \code{sample_data} component of \code{x} with
+#' \code{value}, if \code{value} is a \code{\link{sample_data-class}}. However,
+#' if \code{value} is a \code{data.frame}, then \code{value} is first coerced to
+#' a \code{\link{sample_data-class}}, and then assigned. Alternatively, if
+#' \code{value} is \code{\link{phyloseq-class}}, then the
+#' \code{\link{sample_data}} component will first be accessed from \code{value}
+#' and then assigned. This makes possible some concise assignment/replacement
+#' statements when adjusting, modifying, or building subsets of
+#' experiment-level data. See some examples below.
+#'
+#' Internally, this re-builds the \code{\link{phyloseq-class}} object using
+#' the standard \code{\link{phyloseq}} constructor. Thus, index mismatches
+#' between sample-describing components will not be allowed, and subsetting
+#' will occur automatically such that only the intersection of sample IDs
+#' are included in any components. This has the added benefit of re-checking
+#' (internally) for any other issues.
+#'
+#' @usage sample_data(x) <- value
+#'
+#' @param x (Required). \code{\link{phyloseq-class}}. The object to modify.
+#' @param value (Required). Either a \code{\link{sample_data-class}},
+#' a \code{data.frame} that can be coerced into \code{\link{sample_data-class}},
+#' or a \code{\link{phyloseq-class}} that contains a
+#' suitable \code{sample_data} component to assign to \code{x}. If unsure,
+#' try \code{\link{sample_data}}\code{(value)}, which should return a
+#' \code{\link{sample_data-class}} object without error.
+#'
+#' @return No return. This is an assignment statement.
+#'
+#' @export
+#' @rdname assign-sample_data
+#' @aliases assign-sample_data sample_data<-
+#' @examples
+#' data(soilrep)
+#' soilrep
+#' head(sample_data(soilrep))
+#' sample_data(soilrep)$Time <- as.integer(substr(sample_data(soilrep)$Sample, 1, 1))
+#' head(sample_data(soilrep))
+"sample_data<-" <- function(x, value){
+ if( !inherits(value, "sample_data") ){
+ value <- sample_data(value) # coerce/extract a sample_data object from value
+ }
+ phyloseq(x@otu_table, value, x@tax_table, x@phy_tree, x@refseq) # rebuild with the new sample_data
+}
+################################################################################
+#' Assign a (new) Taxonomy Table to \code{x}
+#'
+#' @usage tax_table(x) <- value
+#'
+#' @param x (Required). \code{\link{phyloseq-class}}
+#' @param value (Required). \code{\link{taxonomyTable-class}}.
+#' Alternatively, \code{value} can be a \code{\link{phyloseq-class}} that has
+#' a \code{\link{tax_table}} component, or a \code{\link{matrix-class}}
+#' that can be coerced to a \code{\link{taxonomyTable-class}} with row indices
+#' that match at least some of the \code{\link{taxa_names}} of \code{x}.
+#'
+#' @export
+#' @rdname assign-tax_table
+#' @aliases assign-tax_table tax_table<-
+#' @examples
+#' # data(GlobalPatterns)
+#' # # An example of pruning to just the first 100 taxa in GlobalPatterns.
+#' # ex2a <- prune_taxa(taxa_names(GlobalPatterns)[1:100], GlobalPatterns)
+#' # # The following 3 lines produces an ex2b that is equal to ex2a
+#' # ex2b <- GlobalPatterns
+#' # TT <- tax_table(GlobalPatterns)[1:100, ]
+#' # tax_table(ex2b) <- TT
+#' # identical(ex2a, ex2b)
+#' # print(ex2b)
+#' # # 2 examples adding a tax_table component from phyloseq or matrix classes
+#' # ex2c <- phyloseq(otu_table(ex2b), sample_data(ex2b), phy_tree(ex2b))
+#' # tax_table(ex2c) <- ex2b
+#' # identical(ex2a, ex2c)
+#' # ex2c <- phyloseq(otu_table(ex2b), sample_data(ex2b), phy_tree(ex2b))
+#' # tax_table(ex2c) <- as(tax_table(ex2b), "matrix")
+#' # identical(ex2a, ex2c)
+setGeneric("tax_table<-", function(x, value) standardGeneric("tax_table<-"))
+#' @rdname assign-tax_table
+#' @aliases tax_table<-,phyloseq,taxonomyTable-method
+setMethod("tax_table<-", c("phyloseq", "taxonomyTable"), function(x, value){
+ phyloseq(x@otu_table, x@sam_data, value, x@phy_tree, x@refseq) # rebuild with the new tax_table
+})
+#' @rdname assign-tax_table
+#' @aliases tax_table<-,phyloseq,ANY-method
+setMethod("tax_table<-", c("phyloseq", "ANY"), function(x, value){
+ phyloseq(x@otu_table, x@sam_data, tax_table(value, FALSE), x@phy_tree, x@refseq) # tax_table() converts value
+})
+#' @rdname assign-tax_table
+#' @aliases tax_table<-,taxonomyTable,taxonomyTable-method
+setMethod("tax_table<-", c("taxonomyTable", "taxonomyTable"), function(x, value){
+ # Assign as-is.
+ value
+})
+#' @rdname assign-tax_table
+#' @aliases tax_table<-,taxonomyTable,ANY-method
+setMethod("tax_table<-", c("taxonomyTable", "ANY"), function(x, value){
+ tax_table(value, FALSE)
+})
+################################################################################
+#' Assign a (new) phylogenetic tree to \code{x}
+#'
+#' @usage phy_tree(x) <- value
+#' @param x (Required). \code{\link{phyloseq-class}}
+#' @param value (Required). \code{\link{phylo-class}}, or \code{\link{phyloseq-class}}
+#'
+#' @export
+#' @docType methods
+#' @rdname assign-phy_tree
+#' @aliases assign-phy_tree phy_tree<-
+#' @examples #
+#' data("esophagus")
+#' # An example of pruning to just the first 20 taxa in esophagus
+#' ex2a <- prune_taxa(taxa_names(esophagus)[1:20], esophagus)
+#' # The following 3 lines produces an ex2b that is equal to ex2a
+#' ex2b <- ex2a
+#' phy_tree(ex2b) <- phy_tree(esophagus)
+#' identical(ex2a, ex2b)
+setGeneric("phy_tree<-", function(x, value) standardGeneric("phy_tree<-"))
+#' @rdname assign-phy_tree
+#' @aliases phy_tree<-,phyloseq,phylo-method
+setMethod("phy_tree<-", c("phyloseq", "phylo"), function(x, value){
+ phyloseq(x@otu_table, x@sam_data, x@tax_table, value, x@refseq) # rebuild with the new tree
+})
+#' @rdname assign-phy_tree
+#' @aliases phy_tree<-,phyloseq,phyloseq-method
+setMethod("phy_tree<-", c("phyloseq", "phyloseq"), function(x, value){
+ phyloseq(x@otu_table, x@sam_data, x@tax_table, phy_tree(value), x@refseq) # take the tree from value
+})
+################################################################################
+#' Replace OTU identifier names
+#'
+#' @usage taxa_names(x) <- value
+#'
+#' @param x (Required). An object defined by the \code{\link{phyloseq-package}}
+#' that describes OTUs in some way.
+#' @param value (Required). A character vector
+#' to replace the current \code{\link{taxa_names}}.
+#'
+#' @export
+#' @docType methods
+#' @rdname assign-taxa_names
+#' @aliases assign-taxa_names taxa_names<-
+#'
+#' @examples
+#' data("esophagus")
+#' taxa_names(esophagus)
+#' # plot_tree(esophagus, label.tips="taxa_names", ladderize="left")
+#' taxa_names(esophagus) <- paste("OTU-", taxa_names(esophagus), sep="")
+#' taxa_names(esophagus)
+#' # plot_tree(esophagus, label.tips="taxa_names", ladderize="left")
+#' ## non-characters are first coerced to characters.
+#' taxa_names(esophagus) <- 1:ntaxa(esophagus)
+#' taxa_names(esophagus)
+#' # plot_tree(esophagus, label.tips="taxa_names", ladderize="left")
+#' ## Cannot assign non-unique or differently-lengthed name vectors. Error.
+#' # taxa_names(esophagus) <- sample(c(TRUE, FALSE), ntaxa(esophagus), TRUE)
+#' # taxa_names(esophagus) <- sample(taxa_names(esophagus), ntaxa(esophagus)-5, FALSE)
+setGeneric("taxa_names<-", function(x, value){
+  # Names must be unique: refuse duplicates before any method is dispatched.
+  if( anyDuplicated(value) > 0L )
+    stop("taxa_names<-: You are attempting to assign duplicated taxa_names")
+  standardGeneric("taxa_names<-")
+})
+# Fallback for a `value` that is not character: coerce it, then dispatch again.
+#' @rdname assign-taxa_names
+#' @aliases taxa_names<-,ANY,ANY-method
+setMethod("taxa_names<-", c("ANY", "ANY"), function(x, value){
+  # Every other method is registered for a character `value`.
+  `taxa_names<-`(x, value = as(value, "character"))
+})
+# `value` is character, but no method is registered for the class of `x`:
+# hand `x` back unchanged.
+#' @rdname assign-taxa_names
+#' @aliases taxa_names<-,ANY,character-method
+setMethod("taxa_names<-", c("ANY", "character"), function(x, value){
+  x
+})
+#' @rdname assign-taxa_names
+#' @aliases taxa_names<-,otu_table,character-method
+setMethod("taxa_names<-", c("otu_table", "character"), function(x, value){
+  # The orientation flag decides which dimension carries the taxa names:
+  # the rows when taxa_are_rows(x) is TRUE, the columns otherwise.
+  if( taxa_are_rows(x) ){
+    return(`rownames<-`(x, value))
+  }
+  `colnames<-`(x, value)
+})
+#' @rdname assign-taxa_names
+#' @aliases taxa_names<-,taxonomyTable,character-method
+setMethod("taxa_names<-", c("taxonomyTable", "character"), function(x, value){
+  # Taxa names are kept as the row names of the table.
+  `rownames<-`(x, value)
+})
+#' @rdname assign-taxa_names
+#' @aliases taxa_names<-,phylo,character-method
+setMethod("taxa_names<-", c("phylo", "character"), function(x, value){
+  x$tip.label <- value # taxa names are kept as the tip labels of the tree
+  x
+})
+#' @rdname assign-taxa_names
+#' @aliases taxa_names<-,XStringSet,character-method
+setMethod("taxa_names<-", c("XStringSet", "character"), function(x, value){
+  names(x) <- value # taxa names are kept as the names of the set
+  x
+})
+#' @rdname assign-taxa_names
+#' @aliases taxa_names<-,phyloseq,character-method
+setMethod("taxa_names<-", c("phyloseq", "character"), function(x, value){
+  # Rename each taxa-indexed slot; dispatch selects the method for the slot's class.
+  taxa_names(x@otu_table) <- value
+  taxa_names(x@phy_tree) <- value
+  taxa_names(x@tax_table) <- value
+  taxa_names(x@refseq) <- value
+  return(x)
+})
+################################################################################
+################################################################################
+#' Replace sample identifier names
+#'
+#' @usage sample_names(x) <- value
+#'
+#' @param x (Required). An object defined by the \code{\link{phyloseq-package}}
+#' that describes samples in some way.
+#' @param value (Required). A character vector
+#' to replace the current \code{\link{sample_names}}.
+#'
+#' @export
+#' @docType methods
+#' @rdname assign-sample_names
+#' @aliases assign-sample_names sample_names<-
+#'
+#' @examples
+#' data("esophagus")
+#' sample_names(esophagus)
+#' # plot_tree(esophagus, color="sample_names", ladderize="left")
+#' sample_names(esophagus) <- paste("Sa-", sample_names(esophagus), sep="")
+#' sample_names(esophagus)
+#' # plot_tree(esophagus, color="sample_names", ladderize="left")
+#' ## non-characters are first coerced to characters.
+#' sample_names(esophagus) <- 1:nsamples(esophagus)
+#' sample_names(esophagus)
+#' # plot_tree(esophagus, color="sample_names", ladderize="left")
+#' ## Cannot assign non-unique or differently-lengthed name vectors. Error.
+#' # sample_names(esophagus) <- sample(c(TRUE, FALSE), nsamples(esophagus), TRUE)
+#' # sample_names(esophagus) <- sample(sample_names(esophagus), nsamples(esophagus)-1, FALSE)
+setGeneric("sample_names<-", function(x, value){
+  # Names must be unique: refuse duplicates before any method is dispatched.
+  if( anyDuplicated(value) > 0L )
+    stop("sample_names<-: You are attempting to assign duplicated sample_names")
+  standardGeneric("sample_names<-")
+})
+# Fallback for a `value` that is not character: coerce it, then dispatch again.
+#' @rdname assign-sample_names
+#' @aliases sample_names<-,ANY,ANY-method
+setMethod("sample_names<-", c("ANY", "ANY"), function(x, value){
+  # Every other method is registered for a character `value`.
+  `sample_names<-`(x, value = as(value, "character"))
+})
+# `value` is character, but no method is registered for the class of `x`:
+# hand `x` back unchanged.
+#' @rdname assign-sample_names
+#' @aliases sample_names<-,ANY,character-method
+setMethod("sample_names<-", c("ANY", "character"), function(x, value){
+  x
+})
+#' @rdname assign-sample_names
+#' @aliases sample_names<-,otu_table,character-method
+setMethod("sample_names<-", c("otu_table", "character"), function(x, value){
+  # Samples occupy the dimension that taxa do not:
+  # the columns when taxa_are_rows(x) is TRUE, the rows otherwise.
+  if( taxa_are_rows(x) ){
+    return(`colnames<-`(x, value))
+  }
+  `rownames<-`(x, value)
+})
+#' @rdname assign-sample_names
+#' @aliases sample_names<-,sample_data,character-method
+setMethod("sample_names<-", c("sample_data", "character"), function(x, value){
+  rownames(x) <- value # sample names are kept as the row names
+  x
+})
+#' @rdname assign-sample_names
+#' @aliases sample_names<-,phyloseq,character-method
+setMethod("sample_names<-", c("phyloseq", "character"), function(x, value){
+  # Rename each sample-indexed slot; dispatch selects the method for the slot's class.
+  sample_names(x@otu_table) <- value
+  sample_names(x@sam_data) <- value
+  return(x)
+})
+################################################################################
\ No newline at end of file
diff --git a/R/deprecated_functions.R b/R/deprecated_functions.R
new file mode 100644
index 0000000..e4c62eb
--- /dev/null
+++ b/R/deprecated_functions.R
@@ -0,0 +1,130 @@
+################################################################################
+#' Deprecated functions in the phyloseq package.
+#'
+#' These will be migrated to \code{"defunct"} status in the next release,
+#' and removed completely in the release after that.
+#' These functions are provided for compatibility with older version of
+#' the phyloseq package. They may eventually be completely
+#' removed.
+#'
+#' @usage deprecated_phyloseq_function(x, value, ...)
+#' @rdname phyloseq-deprecated
+#' @name phyloseq-deprecated
+#' @param x For assignment operators, the object that will undergo a replacement
+#' (object inside parenthesis).
+#' @param value For assignment operators, the value to replace with
+#' (the right side of the assignment).
+#' @param ... For functions other than assignment operators,
+#' parameters to be passed to the modern version of the function (see table).
+#' @docType package
+#' @export plot_taxa_bar taxaplot taxtab taxTab sampleData samData sam_data speciesSums sampleSums nspecies species.names sampleNames sample.names getSamples getSpecies rank.names getTaxa sample.variables getVariable merge_species otuTable speciesarerows speciesAreRows plot_richness_estimates import_qiime_sampleData filterfunSample genefilterSample prune_species subset_species tipglom taxglom tre show_mothur_list_cutoffs sam_data<- sampleData<- tre<- speciesAreRows<- otuTable<- taxTab<-
+#' @aliases deprecated_phyloseq_function plot_taxa_bar taxaplot taxtab taxTab sampleData samData sam_data speciesSums sampleSums nspecies species.names sampleNames sample.names getSamples getSpecies rank.names getTaxa sample.variables getVariable merge_species otuTable speciesarerows speciesAreRows plot_richness_estimates import_qiime_sampleData filterfunSample genefilterSample prune_species subset_species tipglom taxglom tre show_mothur_list_cutoffs sam_data<- sampleData<- tre<- species [...]
+#' @details
+#' \tabular{rl}{
+#' \code{plot_taxa_bar} \tab now a synonym for \code{\link{plot_bar}}\cr
+#' \code{taxaplot} \tab now a synonym for \code{\link{plot_bar}}\cr
+#' \code{taxtab} \tab now a synonym for \code{\link{tax_table}}\cr
+#' \code{taxTab} \tab now a synonym for \code{\link{tax_table}}\cr
+#' \code{sampleData} \tab now a synonym for \code{\link{sample_data}}\cr
+#' \code{samData} \tab now a synonym for \code{\link{sample_data}}\cr
+#' \code{sam_data} \tab now a synonym for \code{\link{sample_data}}\cr
+#' \code{speciesSums} \tab now a synonym for \code{\link{taxa_sums}}\cr
+#' \code{sampleSums} \tab now a synonym for \code{\link{sample_sums}}\cr
+#' \code{nspecies} \tab now a synonym for \code{\link{ntaxa}}\cr
+#' \code{species.names} \tab now a synonym for \code{\link{taxa_names}}\cr
+#' \code{sampleNames} \tab now a synonym for \code{\link{sample_names}}\cr
+#' \code{sample.names} \tab now a synonym for \code{\link{sample_names}}\cr
+#' \code{getSamples} \tab now a synonym for \code{\link{get_sample}}\cr
+#' \code{getSpecies} \tab now a synonym for \code{\link{get_taxa}}\cr
+#' \code{rank.names} \tab now a synonym for \code{\link{rank_names}}\cr
+#' \code{getTaxa} \tab now a synonym for \code{\link{get_taxa_unique}}\cr
+#' \code{sample.variables} \tab now a synonym for \code{\link{sample_variables}}\cr
+#' \code{getVariable} \tab now a synonym for \code{\link{get_variable}}\cr
+#' \code{merge_species} \tab now a synonym for \code{\link{merge_taxa}}\cr
+#' \code{otuTable} \tab now a synonym for \code{\link{otu_table}}\cr
+#' \code{speciesarerows} \tab now a synonym for \code{\link{taxa_are_rows}}\cr
+#' \code{speciesAreRows} \tab now a synonym for \code{\link{taxa_are_rows}}\cr
+#' \code{plot_richness_estimates} \tab now a synonym for \code{\link{plot_richness}}\cr
+#' \code{import_qiime_sampleData} \tab now a synonym for \code{\link{import_qiime_sample_data}}\cr
+#' \code{filterfunSample} \tab now a synonym for \code{\link{filterfun_sample}}\cr
+#' \code{genefilterSample} \tab now a synonym for \code{\link{genefilter_sample}}\cr
+#' \code{prune_species} \tab now a synonym for \code{\link{prune_taxa}}\cr
+#' \code{subset_species} \tab now a synonym for \code{\link{subset_taxa}}\cr
+#' \code{tipglom} \tab now a synonym for \code{\link{tip_glom}}\cr
+#' \code{taxglom} \tab now a synonym for \code{\link{tax_glom}}\cr
+#' \code{tre} \tab now a synonym for \code{\link{phy_tree}}\cr
+#' \code{show_mothur_list_cutoffs} \tab now a synonym for \code{\link{show_mothur_cutoffs}}\cr
+#' \code{sam_data<-} \tab now a synonym for \code{\link{sample_data<-}}\cr
+#' \code{sampleData<-} \tab now a synonym for \code{\link{sample_data<-}}\cr
+#' \code{tre<-} \tab now a synonym for \code{\link{phy_tree<-}}\cr
+#' \code{speciesAreRows<-} \tab now a synonym for \code{\link{taxa_are_rows<-}}\cr
+#' \code{otuTable<-} \tab now a synonym for \code{\link{otu_table<-}}\cr
+#' \code{taxTab<-} \tab now a synonym for \code{\link{tax_table<-}}\cr
+#' }
+#'
+deprecated_phyloseq_function <- function(x, value, ...) NULL # documentation placeholder: always returns NULL
+plot_taxa_bar <- function(...){ .Deprecated("plot_bar", package="phyloseq"); plot_bar(...) }
+taxaplot <- function(...){ .Deprecated("plot_bar", package="phyloseq"); plot_bar(...) }
+taxtab <- function(...){ .Deprecated("tax_table", package="phyloseq"); tax_table(...) }
+taxTab <- function(...){ .Deprecated("tax_table", package="phyloseq"); tax_table(...) }
+sampleData <- function(...){ .Deprecated("sample_data", package="phyloseq"); sample_data(...) }
+samData <- function(...){ .Deprecated("sample_data", package="phyloseq"); sample_data(...) }
+sam_data <- function(...){ .Deprecated("sample_data", package="phyloseq"); sample_data(...) }
+speciesSums <- function(...){ .Deprecated("taxa_sums", package="phyloseq"); taxa_sums(...) }
+sampleSums <- function(...){ .Deprecated("sample_sums", package="phyloseq"); sample_sums(...) }
+nspecies <- function(...){ .Deprecated("ntaxa", package="phyloseq"); ntaxa(...) }
+species.names <- function(...){ .Deprecated("taxa_names", package="phyloseq"); taxa_names(...) }
+sampleNames <- function(...){ .Deprecated("sample_names", package="phyloseq"); sample_names(...) }
+sample.names <- function(...){ .Deprecated("sample_names", package="phyloseq"); sample_names(...) }
+getSamples <- function(...){ .Deprecated("get_sample", package="phyloseq"); get_sample(...) }
+getSpecies <- function(...){ .Deprecated("get_taxa", package="phyloseq"); get_taxa(...) }
+rank.names <- function(...){ .Deprecated("rank_names", package="phyloseq"); rank_names(...) }
+getTaxa <- function(...){ .Deprecated("get_taxa_unique", package="phyloseq"); get_taxa_unique(...) }
+sample.variables <- function(...){ .Deprecated("sample_variables", package="phyloseq"); sample_variables(...) }
+getVariable <- function(...){ .Deprecated("get_variable", package="phyloseq"); get_variable(...) }
+merge_species <- function(...){ .Deprecated("merge_taxa", package="phyloseq"); merge_taxa(...) }
+otuTable <- function(...){ .Deprecated("otu_table", package="phyloseq"); otu_table(...) }
+speciesarerows <- function(...){ .Deprecated("taxa_are_rows", package="phyloseq"); taxa_are_rows(...) }
+speciesAreRows <- function(...){ .Deprecated("taxa_are_rows", package="phyloseq"); taxa_are_rows(...) }
+plot_richness_estimates <- function(...){ .Deprecated("plot_richness", package="phyloseq"); plot_richness(...) }
+import_qiime_sampleData <- function(...){ .Deprecated("import_qiime_sample_data", package="phyloseq"); import_qiime_sample_data(...) }
+filterfunSample <- function(...){ .Deprecated("filterfun_sample", package="phyloseq"); filterfun_sample(...) }
+genefilterSample <- function(...){ .Deprecated("genefilter_sample", package="phyloseq"); genefilter_sample(...) }
+prune_species <- function(...){ .Deprecated("prune_taxa", package="phyloseq"); prune_taxa(...) }
+subset_species <- function(...){ .Deprecated("subset_taxa", package="phyloseq"); subset_taxa(...) }
+tipglom <- function(...){ .Deprecated("tip_glom", package="phyloseq"); tip_glom(...) }
+taxglom <- function(...){ .Deprecated("tax_glom", package="phyloseq"); tax_glom(...) }
+tre <- function(...){ .Deprecated("phy_tree", package="phyloseq"); phy_tree(...) }
+show_mothur_list_cutoffs <- function(...){ .Deprecated("show_mothur_cutoffs", package="phyloseq"); show_mothur_cutoffs(...) }
+originalUniFrac <- function(...){ .Deprecated("fastUniFrac", package="phyloseq"); fastUniFrac(...) }
+"sam_data<-" <- function(x, value){
+  .Deprecated("sample_data<-", package="phyloseq")
+  # Forward to the current replacement function; its result is the modified x.
+  `sample_data<-`(x, value)
+}
+"sampleData<-" <- function(x, value){
+  .Deprecated("sample_data<-", package="phyloseq")
+  # Forward to the current replacement function; its result is the modified x.
+  `sample_data<-`(x, value)
+}
+"tre<-" <- function(x, value){
+  .Deprecated("phy_tree<-", package="phyloseq")
+  # Forward to the current replacement function; its result is the modified x.
+  `phy_tree<-`(x, value)
+}
+"speciesAreRows<-" <- function(x, value){
+  .Deprecated("taxa_are_rows<-", package="phyloseq")
+  # Forward to the current replacement function; its result is the modified x.
+  `taxa_are_rows<-`(x, value)
+}
+"otuTable<-" <- function(x, value){
+  .Deprecated("otu_table<-", package="phyloseq")
+  # Forward to the current replacement function; its result is the modified x.
+  `otu_table<-`(x, value)
+}
+"taxTab<-" <- function(x, value){
+  .Deprecated("tax_table<-", package="phyloseq")
+  # Forward to the current replacement function; its result is the modified x.
+  `tax_table<-`(x, value)
+}
+################################################################################
diff --git a/R/distance-methods.R b/R/distance-methods.R
new file mode 100644
index 0000000..99b81fc
--- /dev/null
+++ b/R/distance-methods.R
@@ -0,0 +1,690 @@
+################################################################################
+#' Calculate distance, dissimilarity
+#'
+#' Takes a \code{\link{phyloseq-class}} object and method option, and returns
+#' a \code{\link{dist}}ance object suitable for certain
+#' ordination methods and other distance-based analyses.
+#' Only
+#' sample-wise distances are currently supported (the \code{type} argument),
+#' but eventually species-wise (OTU-wise)
+#' distances may be supported as well.
+#'
+#' Depending on the \code{method}
+#' argument, \code{distance()} wraps one of
+#' \code{\link{UniFrac}},
+#' \code{\link{DPCoA}},
+#' \code{\link{JSD}},
+#' \code{\link[vegan]{vegdist}},
+#' \code{\link[vegan]{betadiver}},
+#' \code{\link[vegan]{designdist}}, or
+#' \code{\link{dist}}.
+#'
+#' @param physeq (Required). A \code{\link{phyloseq-class}} or
+#' an \code{\link{otu_table-class}} object. The latter is only appropriate
+#' for methods that do not require any additional data (one-table).
+#' For example, the ``wunifrac'' option (\code{\link{UniFrac}}) requires
+#' \code{\link{phyloseq-class}} that contains both an \code{otu_table}
+#' and a phylogenetic tree (\code{phylo}).
+#'
+#' @param method (Required). A character string.
+#' Provide one of the currently supported options.
+#' See \code{\link{distanceMethodList}} for a detailed list
+#' of the supported options here,
+#' and links to accompanying documentation.
+#'
+#' Note that for the common definition of \code{Jaccard} distance
+#' using the \code{vegan-package} implementation,
+#' an additional argument is needed, with the full call having the form:
+#' \code{distance(physeq, method = "jaccard", binary = TRUE)}
+#'
+#' The following methods are implemented explicitly within
+#' the \code{\link{phyloseq-package}},
+#' and accessed by the following \code{method} options:
+#'
+#' \describe{
+#' \item{\code{"unifrac"}}{Original (unweighted) UniFrac distance,
+#' \code{\link[phyloseq]{UniFrac}}}
+#' \item{\code{"wunifrac"}}{weighted-UniFrac distance, \code{\link[phyloseq]{UniFrac}}}
+#' \item{\code{"dpcoa"}}{
+#' sample-wise distance used in
+#' Double Principal Coordinate Analysis, \code{\link[phyloseq]{DPCoA}}}
+#' \item{\code{"jsd"}}{Jensen-Shannon Divergence, \code{\link{JSD}}}
+#' }
+#'
+#' Alternatively, you can provide
+#' a character string that defines a custom distance method, if it has the form
+#' described in \code{\link{designdist}}.
+#'
+#' @param type (Optional). A character string. The type of pairwise comparisons
+#' being calculated: sample-wise or taxa-wise. The default is
+#' \code{c("samples")}.
+#'
+#' @param ... Additional arguments passed on to the appropriate distance
+#' function, determined by the \code{method} argument.
+#'
+#' @return An object of class ``\code{\link{dist}}'' suitable for certain
+#' ordination methods and other distance-based analyses.
+#'
+#' @seealso
+#' \code{\link{plot_ordination}},
+#' \code{\link{UniFrac}},
+#' \code{\link{DPCoA}},
+#' \code{\link{JSD}},
+#' \code{\link[vegan]{vegdist}},
+#' \code{\link[vegan]{betadiver}},
+#' \code{\link[vegan]{designdist}},
+#' \code{\link{dist}}.
+#'
+#' @importFrom vegan betadiver
+#' @importFrom vegan designdist
+#' @importFrom vegan vegdist
+#' @export
+#' @examples
+#' data(esophagus)
+#' distance(esophagus, "uunifrac") # Unweighted UniFrac
+#' distance(esophagus, "wunifrac") # weighted UniFrac
+#' distance(esophagus, "jaccard", binary = TRUE) # vegdist jaccard
+#' distance(esophagus, "gower") # vegdist option "gower"
+#' distance(esophagus, "g") # designdist method option "g"
+#' distance(esophagus, "minkowski") # invokes a method from the base dist() function.
+#' distance(esophagus, "(A+B-2*J)/(A+B)") # designdist custom distance
+#' distanceMethodList
+#' help("distance")
+setGeneric("distance", function(physeq, method, type="samples", ...){ # the methods below are registered on (physeq, method) class pairs
+ standardGeneric("distance")
+})
+#' @rdname distance
+setMethod("distance", c("phyloseq", "ANY"), function(physeq, method){ # fallback: `method` is not a character string, so always fail
+ stop("You must specify a `method` argument as a character string.
+ \nIt was missing/NA or not a character string.
+ \nSee `?distanceMethodList`")
+})
+#' @rdname distance
+setMethod("distance", c("otu_table", "character"), function(physeq, method, type="samples", ...){
+  # Jensen-Shannon divergence is computed by JSD(); hand the table over at once.
+  if( method == "jsd" ){
+    return(JSD(physeq))
+  }
+  # Choose the downstream function. Any method that is not listed under one of
+  # the families below is passed to designdist.
+  dfun <- "designdist"
+  for( family in c("dist", "betadiver", "vegdist") ){
+    # A later match overwrites an earlier one: vegdist takes precedence,
+    # then betadiver, then dist.
+    if( method %in% distanceMethodList[[family]] ){
+      dfun <- family
+    }
+  }
+  # Extra arguments for the downstream function (possibly none).
+  dots <- list(...)
+  # Normalise `type`, ignoring case: "OTU", "OTUs", "taxa", "taxas" and
+  # "species" become "species" ...
+  type <- gsub("(OTU(s)?)|(taxa(s)?)|(Species)", "species", type, ignore.case = TRUE)
+  # ... while "sample", "samples", "site" and "sites" become "samples".
+  type <- gsub("(Sample(s)?)|(site(s)?)", "samples", type, ignore.case = TRUE)
+  # The compared units must be the rows of the matrix handed downstream.
+  if( type == "species" ){
+    taxa_rows_wanted <- TRUE
+  } else if( type == "samples" ){
+    taxa_rows_wanted <- FALSE
+  } else {
+    stop("type argument must be one of \n (1) samples \n or \n (2) species")
+  }
+  counts <- physeq
+  if( taxa_are_rows(counts) != taxa_rows_wanted ){
+    counts <- t(counts)
+  }
+  counts <- as(counts, "matrix")
+  return( do.call(dfun, c(list(counts, method=method), dots)) )
+})
+#' @rdname distance
+setMethod("distance", c("phyloseq", "character"), function(physeq, method, type="samples", ...){
+  # Only one method at a time.
+  if(length(method) > 1){
+    stop("`distance` only accepts one method at a time. ",
+      "You provided ", length(method), " methods. ")
+  }
+  # `||` short-circuits, so is.na() is never evaluated on a zero-length `method`.
+  if(length(method) < 1 || is.na(method)){
+    stop("You must specify a `method` argument. \nIt was missing/NA. \nSee `?distanceMethodList`")
+  }
+  # Case-insensitive UniFrac spellings, e.g. "uunifrac" -> "unifrac", "weighted-unifrac" -> "wunifrac".
+  method <- gsub("^(u.*)*unifrac$", "unifrac", method, ignore.case = TRUE)
+  method <- gsub("^w.*unifrac$", "wunifrac", method, ignore.case = TRUE)
+  # These distances need the whole phyloseq object (e.g. its tree), so compute them here.
+  if( method == "unifrac" ){ return(UniFrac(physeq, ...)) }
+  if( method == "wunifrac" ){ return(UniFrac(physeq, weighted=TRUE, ...)) }
+  if( method == "dpcoa" ){
+    # Return the `RaoDis` element of the DPCoA result as a "dist" object, diag=FALSE.
+    return(as.dist(DPCoA(physeq, ...)$RaoDis, diag=FALSE))
+  }
+  # Everything else is computed from the OTU table alone.
+  return(distance(otu_table(physeq), method, type, ...))
+})
+################################################################################
+#' List of distance method keys supported in \code{\link[phyloseq]{distance}}
+#'
+#' Distance methods should be specified by exact string match.
+#' Cannot do partial matching for all options,
+#' because too many similar options in downstream method dispatch.
+#'
+#' @format A list of character vectors.
+#' Every entry specifies a supported distance method.
+#' Names in the list indicate which downstream function
+#' is being utilized for further details.
+#' Same functions are linked in the itemized list below.
+#'
+#' \describe{
+#' \item{\code{unifrac}}{\code{\link[phyloseq]{UniFrac}}}
+#' \item{\code{wunifrac}}{\code{\link[phyloseq]{UniFrac}}}
+#' \item{\code{dpcoa}}{\code{\link[phyloseq]{DPCoA}}}
+#' \item{\code{jsd}}{\code{\link{JSD}}}
+#' \item{\code{manhattan}}{\code{\link[vegan]{vegdist}}}
+#' \item{\code{euclidean}}{\code{\link[vegan]{vegdist}}}
+#' \item{\code{canberra}}{\code{\link[vegan]{vegdist}}}
+#' \item{\code{bray}}{\code{\link[vegan]{vegdist}}}
+#' \item{\code{kulczynski}}{\code{\link[vegan]{vegdist}}}
+#' \item{\code{jaccard}}{\code{\link[vegan]{vegdist}}}
+#' \item{\code{gower}}{\code{\link[vegan]{vegdist}}}
+#' \item{\code{altGower}}{\code{\link[vegan]{vegdist}}}
+#' \item{\code{morisita}}{\code{\link[vegan]{vegdist}}}
+#' \item{\code{horn}}{\code{\link[vegan]{vegdist}}}
+#' \item{\code{mountford}}{\code{\link[vegan]{vegdist}}}
+#' \item{\code{raup}}{\code{\link[vegan]{vegdist}}}
+#' \item{\code{binomial}}{\code{\link[vegan]{vegdist}}}
+#' \item{\code{chao}}{\code{\link[vegan]{vegdist}}}
+#' \item{\code{cao}}{\code{\link[vegan]{vegdist}}}
+#' \item{\code{w}}{\code{\link[vegan]{betadiver}}}
+#' \item{\code{-1}}{\code{\link[vegan]{betadiver}}}
+#' \item{\code{c}}{\code{\link[vegan]{betadiver}}}
+#' \item{\code{wb}}{\code{\link[vegan]{betadiver}}}
+#' \item{\code{r}}{\code{\link[vegan]{betadiver}}}
+#' \item{\code{I}}{\code{\link[vegan]{betadiver}}}
+#' \item{\code{e}}{\code{\link[vegan]{betadiver}}}
+#' \item{\code{t}}{\code{\link[vegan]{betadiver}}}
+#' \item{\code{me}}{\code{\link[vegan]{betadiver}}}
+#' \item{\code{j}}{\code{\link[vegan]{betadiver}}}
+#' \item{\code{sor}}{\code{\link[vegan]{betadiver}}}
+#' \item{\code{m}}{\code{\link[vegan]{betadiver}}}
+#' \item{\code{-2}}{\code{\link[vegan]{betadiver}}}
+#' \item{\code{co}}{\code{\link[vegan]{betadiver}}}
+#' \item{\code{cc}}{\code{\link[vegan]{betadiver}}}
+#' \item{\code{g}}{\code{\link[vegan]{betadiver}}}
+#' \item{\code{-3}}{\code{\link[vegan]{betadiver}}}
+#' \item{\code{l}}{\code{\link[vegan]{betadiver}}}
+#' \item{\code{hk}}{\code{\link[vegan]{betadiver}}}
+#' \item{\code{rlb}}{\code{\link[vegan]{betadiver}}}
+#' \item{\code{sim}}{\code{\link[vegan]{betadiver}}}
+#' \item{\code{gl}}{\code{\link[vegan]{betadiver}}}
+#' \item{\code{z}}{\code{\link[vegan]{betadiver}}}
+#' \item{\code{maximum}}{\code{\link[stats]{dist}}}
+#' \item{\code{binary}}{\code{\link[stats]{dist}}}
+#' \item{\code{minkowski}}{\code{\link[stats]{dist}}}
+#' \item{\code{ANY}}{\code{\link[vegan]{designdist}}}
+#' }
+#'
+#' @seealso
+#' \code{\link[phyloseq]{distance}}
+#'
+#' @export
+#'
+#' @examples
+#' distanceMethodList
+distanceMethodList <- list(
+  UniFrac    = c("unifrac", "wunifrac"),
+  DPCoA      = "dpcoa",
+  JSD        = "jsd",
+  # Method keys handled by vegan::vegdist.
+  vegdist    = c("manhattan", "euclidean", "canberra", "bray", "kulczynski",
+    "jaccard", "gower", "altGower", "morisita", "horn", "mountford", "raup",
+    "binomial", "chao", "cao"),
+  # Method keys handled by vegan::betadiver.
+  betadiver  = c("w", "-1", "c", "wb", "r", "I", "e", "t", "me", "j", "sor", "m",
+    "-2", "co", "cc", "g", "-3", "l", "19", "hk", "rlb", "sim", "gl", "z"),
+  # Method keys handled by stats::dist; "ANY" stands for a custom designdist formula.
+  dist       = c("maximum", "binary", "minkowski"),
+  designdist = "ANY"
+)
+################################################################################
+# Shannon-Jensen Divergence, in R.
+################################################################################
+#' @keywords internal
+phyloseq_JSD_pair <- function(x, y){
+  # Jensen-Shannon divergence between two samples.
+  # x and y hold the frequencies of the same categories and are expected to be
+  # relative abundances already (the transformation is left to the caller).
+
+  # The mixture (mean point) of the two distributions.
+  mixture <- (x + y) / 2
+  # Element-wise p * log(p / mixture). Wherever that is not finite (zero
+  # entries make the log undefined) the contribution is defined as zero.
+  kl_terms <- function(p){
+    terms <- p * log(p / mixture)
+    terms[!is.finite(terms)] <- 0
+    terms
+  }
+  sum((kl_terms(x) + kl_terms(y)) / 2, na.rm = TRUE)
+}
+################################################################################
+#' Calculate the Jensen-Shannon Divergence (distance)
+#'
+#' This is a phyloseq-specific implementation of the Jensen-Shannon Divergence
+#' for comparing pairs of microbial communities (samples) in an experiment.
+#' The expectation is that you have many samples (say, more than two) and you
+#' want a distance matrix on which you will perform further analysis. \code{JSD} is
+#' intended to be ``wrapped'' by the more general \code{\link{distance}}
+#' function in phyloseq, and it can be invoked using \code{"jsd"} as the
+#' argument to the \code{method} parameter of \code{\link{distance}}.
+#'
+#' One of the motivations for providing JSD in phyloseq was its recent use in
+#' the analysis of the \code{\link{enterotype}} dataset.
+#'
+#' @param physeq (Required). \code{\link{phyloseq-class}}.
+#' The phyloseq data on which to compute the
+#' pairwise sample distance matrix.
+#'
+#' @return An object of class ``\code{\link{dist}}'' suitable for certain
+#' ordination methods and other distance-based analyses.
+#' See \code{\link{distance}}.
+#'
+#' @seealso
+#' \code{\link{distance}}
+#'
+#' \code{\link{enterotype}}
+#'
+#' \url{http://en.wikipedia.org/wiki/Jensen-Shannon_divergence}
+#'
+#' @references
+#' Jensen-Shannon Divergence and Hilbert space embedding.
+#' Bent Fuglede and Flemming Topsoe University of Copenhagen,
+#' Department of Mathematics
+#' \url{http://www.math.ku.dk/~topsoe/ISIT2004JSD.pdf}
+#'
+#' @author
+#' Susan Holmes \email{susan@@stat.stanford.edu}.
+#' Adapted for phyloseq by Paul J. McMurdie.
+#'
+#' @keywords internal
+#' @examples
+#' # library(doParallel) # Do this and next line only if you have multi-cores
+#' # registerDoParallel(cores=6)
+#' # data(enterotype)
+#' # # ent.jsd <- JSD(enterotype, TRUE) # internal only
+#' # ent.jsd <- distance(enterotype, "jsd", parallel=TRUE)
+#' # ent.PCoA <- ordinate(enterotype, "PCoA", ent.jsd) # Perform principal coordinate analysis
+#' # p <- plot_ordination(enterotype, ent.PCoA, color="Enterotype", shape="SeqTech")
+#' # (p <- p + geom_point(size=5, alpha=0.5))
+setGeneric("JSD", function(physeq){ # single-argument generic; the methods below are registered per class of physeq
+ standardGeneric("JSD")
+})
+setMethod("JSD", "ANY", function(physeq){ # fallback for any class without its own JSD method: always fail
+ stop("JSD requires specific input classes. Check call and try again")
+})
+setMethod("JSD", "phyloseq", function(physeq){
+  return(JSD(otu_table(physeq))) # dispatch again on the object's OTU table
+})
+setMethod("JSD", "otu_table", function(physeq){
+  # Put the taxa in the columns: transpose a taxa-as-rows table.
+  taxa_as_columns <- if(taxa_are_rows(physeq)){ t(physeq) } else { physeq }
+  # Coerce to a plain matrix and dispatch again on it.
+  return(JSD(as(taxa_as_columns, "matrix")))
+})
+# Assumes samples are rows.
+setMethod("JSD", "matrix", function(physeq){
+  # Pairwise Jensen-Shannon divergence between the rows of a matrix.
+  #
+  # physeq: numeric matrix with one row per sample.
+  # Returns a "dist" object holding one value per pair of rows, labelled with
+  # the row names of `physeq` when it has any.
+  #
+  # Coerce to relative abundance by sample (row).
+  physeq <- sweep(physeq, 1, rowSums(physeq), "/")
+  n_samples <- nrow(physeq)
+  # Square holder for the results; only its lower triangle is filled, and that
+  # is the part as.dist() reads.
+  DistMat <- matrix(NA_real_, n_samples, n_samples,
+    dimnames = list(row.names(physeq), row.names(physeq)))
+  # Plain sequential loops. The former foreach/%dopar% version had to call
+  # registerDoSEQ() first, which replaced whatever parallel backend the user
+  # had registered, for the rest of the session.
+  # Rows are addressed by position rather than by name, so missing or
+  # duplicated row names and a single-row matrix are handled as well.
+  for( a in seq_len(max(n_samples - 1L, 0L)) ){
+    for( b in (a + 1L):n_samples ){
+      # Same pair order as before: earlier row first, stored at [later, earlier].
+      DistMat[b, a] <- phyloseq_JSD_pair(physeq[a, ], physeq[b, ])
+    }
+  }
+  return(as.dist(DistMat))
+})
+##############################################################################
+#' Calculate weighted or unweighted (Fast) UniFrac distance for all sample pairs.
+#'
+#' This function calculates the (Fast) UniFrac distance for all sample-pairs
+#' in a \code{\link{phyloseq-class}} object.
+#'
+#' \code{UniFrac()} accesses the abundance
+#' (\code{\link{otu_table-class}}) and a phylogenetic tree (\code{\link{phylo-class}})
+#' data within an experiment-level (\code{\link{phyloseq-class}}) object.
+#' If the tree and contingency table are separate objects, suggested solution
+#' is to combine them into an experiment-level class
+#' using the \code{\link{phyloseq}} function. For example, the following code
+#'
+#' \code{phyloseq(myotu_table, myTree)}
+#'
+#' returns a \code{phyloseq}-class object that has been pruned and comprises
+#' the minimum arguments necessary for \code{UniFrac()}.
+#'
+#' Parallelization is possible for UniFrac calculated with the \code{\link{phyloseq-package}},
+#' and is encouraged in the instances of large trees, many samples, or both.
+#' Parallelization has been implemented via the \code{\link{foreach-package}}.
+#' This means that parallel calls need to be preceded by 2 or more commands
+#' that register the parallel ``backend''. This is achieved via your choice of
+#' helper packages. One of the simplest seems to be the \emph{doParallel} package.
+#'
+#' For more information, see the following links on registering the ``backend'':
+#'
+#' \emph{foreach} package manual:
+#'
+#' \url{http://cran.r-project.org/web/packages/foreach/index.html}
+#'
+#' Notes on parallel computing in \code{R}. Skip to the section describing
+#' the \emph{foreach Framework}. It gives off-the-shelf examples for registering
+#' a parallel backend using the \emph{doMC}, \emph{doSNOW}, or \emph{doMPI} packages:
+#'
+#' \url{http://trg.apbionet.org/euasiagrid/docs/parallelR.notes.pdf}
+#'
+#' Furthermore, as of \code{R} version \code{2.14.0} and higher, a parallel package
+#' is included as part of the core installation, \code{\link{parallel-package}},
+#' and this can be used as the parallel backend with the \code{\link{foreach-package}}
+#' using the adaptor package ``doParallel''.
+#' \url{http://cran.r-project.org/web/packages/doParallel/index.html}
+#'
+#' See the vignette for some simple examples for using doParallel.
+#' \url{http://cran.r-project.org/web/packages/doParallel/vignettes/gettingstartedParallel.pdf}
+#'
+#' UniFrac-specific examples for doParallel are provided in the example
+#' code below.
+#'
+#' @usage UniFrac(physeq, weighted=FALSE, normalized=TRUE, parallel=FALSE, fast=TRUE)
+#'
+#' @param physeq (Required). \code{\link{phyloseq-class}}, containing at minimum
+#' a phylogenetic tree (\code{\link{phylo-class}}) and
+#' contingency table (\code{\link{otu_table-class}}). See
+#' examples below for coercions that might be necessary.
+#'
+#' @param weighted (Optional). Logical. Should use weighted-UniFrac calculation?
+#' Weighted-UniFrac takes into account the relative abundance of species/taxa
+#' shared between samples, whereas unweighted-UniFrac only considers
+#' presence/absence. Default is \code{FALSE}, meaning the unweighted-UniFrac
+#' distance is calculated for all pairs of samples.
+#'
+#' @param normalized (Optional). Logical. Should the output be normalized such that values
+#' range from 0 to 1 independent of branch length values? Default is \code{TRUE}.
+#' Note that (unweighted) \code{UniFrac} is always normalized by total branch-length,
+#' and so this value is ignored when \code{weighted == FALSE}.
+#'
+#' @param parallel (Optional). Logical. Should execute calculation in parallel,
+#' using multiple CPU cores simultaneously? This can dramatically hasten the
+#' computation time for this function. However, it also requires that the user
+#' has registered a parallel ``backend'' prior to calling this function.
+#' Default is \code{FALSE}. If FALSE, UniFrac will register a serial backend
+#' so that \code{foreach::\%dopar\%} does not throw a warning.
+#'
+#' @param fast (Optional). Logical. DEPRECATED.
+#' Do you want to use the ``Fast UniFrac''
+#' algorithm? Implemented natively in the \code{phyloseq-package}.
+#' \code{TRUE} is now the only supported option.
+#' There should be no difference in the output between the two algorithms.
+#' Moreover, the original UniFrac algorithm
+#' only outperforms this implementation of fast-UniFrac if the datasets are so
+#' small
+#' (approximated by the value of \code{ntaxa(physeq) * nsamples(physeq)})
+#' that the difference in time is inconsequential (less than 1 second).
+#' In practice it does not appear that this parameter should
+#' have ever been set to \code{FALSE}, and therefore
+#' the original UniFrac implementation perhaps never should have been supported here.
+#' For legacy code support the option is now deprecated here
+#' (the implementation was an internal function, anyway)
+#' and the \code{fast} option will remain for one release cycle before
+#' being removed completely
+#' in order to avoid causing unsupported-argument errors.
+#'
+#' @return a sample-by-sample distance matrix, suitable for NMDS, etc.
+#'
+#' @seealso
+#'
+#' \code{\link{distance}}
+#'
+#' \code{unifrac} in the picante package.
+#'
+#' @references
+#'
+#' \url{http://bmf.colorado.edu/unifrac/}
+#'
+#' The main implementation (Fast UniFrac) is adapted from the algorithm's
+#' description in:
+#'
+#' Hamady, Lozupone, and Knight,
+#' ``\href{http://www.nature.com/ismej/journal/v4/n1/full/ismej200997a.html}{Fast UniFrac:}
+#' facilitating high-throughput phylogenetic analyses of
+#' microbial communities including analysis of pyrosequencing and PhyloChip data.''
+#' The ISME Journal (2010) 4, 17--27.
+#'
+#' See also additional descriptions of UniFrac in the following articles:
+#'
+#' Lozupone, Hamady and Knight, ``UniFrac - An Online Tool for Comparing Microbial
+#' Community Diversity in a Phylogenetic Context.'', BMC Bioinformatics 2006, 7:371
+#'
+#' Lozupone, Hamady, Kelley and Knight, ``Quantitative and qualitative (beta)
+#' diversity measures lead to different insights into factors that structure
+#' microbial communities.'' Appl Environ Microbiol. 2007
+#'
+#' Lozupone C, Knight R. ``UniFrac: a new phylogenetic method for comparing microbial
+#' communities.'' Appl Environ Microbiol. 2005 71 (12):8228-35.
+#'
+#' @docType methods
+#' @export
+#' @import foreach
+#' @rdname UniFrac-methods
+#' @examples
+#' ################################################################################
+#' # Perform UniFrac on esophagus data
+#' ################################################################################
+#' data("esophagus")
+#' (y <- UniFrac(esophagus, TRUE))
+#' UniFrac(esophagus, TRUE, FALSE)
+#' UniFrac(esophagus, FALSE)
+#' # ################################################################################
+#' # # Now try a parallel implementation using doParallel, which leverages the
+#' # # new 'parallel' core package in R 2.14.0+
+#' # # Note that simply loading the 'doParallel' package is not enough, you must
+#' # # call a function that registers the backend. In general, this is pretty easy
+#' # # with the 'doParallel package' (or one of the alternative 'do*' packages)
+#' # #
+#' # # Also note that the esophagus example has only 3 samples, and a relatively small
+#' # # tree. This is fast to calculate even sequentially and does not warrant
+#' # # parallelized computation, but provides a good quick example for using UniFrac()
+#' # # in a parallel fashion. The number of cores you should specify during the
+#' # # backend registration, using registerDoParallel(), depends on your system and
+#' # # needs. 3 is chosen here for convenience. If your system has only 2 cores, this
+#' # # will probably fault or run slower than necessary.
+#' # ################################################################################
+#' # library(doParallel)
+#' # data(esophagus)
+#' # # For SNOW-like functionality (works on Windows):
+#' # cl <- makeCluster(3)
+#' # registerDoParallel(cl)
+#' # UniFrac(esophagus, TRUE)
+#' # # Force to sequential backend:
+#' # registerDoSEQ()
+#' # # For multicore-like functionality (will probably not work on windows),
+#' # # register the backend like this:
+#' # registerDoParallel(cores=3)
+#' # UniFrac(esophagus, TRUE)
+#' ################################################################################
+setGeneric("UniFrac", function(physeq, weighted=FALSE, normalized=TRUE, parallel=FALSE, fast=TRUE){
+ # S4 generic: the body does nothing except hand control to method dispatch
+ # for "UniFrac"; the actual work lives in the registered methods.
+ standardGeneric("UniFrac")
+})
+################################################################################
+#' @aliases UniFrac,phyloseq-method
+#' @rdname UniFrac-methods
+#' @importFrom ape is.rooted
+#' @importFrom ape root
+setMethod("UniFrac", "phyloseq", function(physeq, weighted=FALSE, normalized=TRUE, parallel=FALSE, fast=TRUE){
+  # Validates the tree carried by `physeq` (branch lengths present, rooted),
+  # rooting it on a random taxon when necessary, and then delegates the
+  # distance calculation to fastUniFrac().
+
+  # UniFrac sums branch lengths, so a tree without them cannot be used.
+  if( is.null(phy_tree(physeq)$edge.length) ){
+    stop("Tree has no branch lengths. See tree$edge.length. Cannot compute UniFrac without branch lengths")
+  }
+  # An unrooted tree is rooted on one randomly drawn taxon; the caller is warned.
+  tree_is_unrooted <- !is.rooted(phy_tree(physeq))
+  if( tree_is_unrooted ){
+    outgroup_taxon <- sample(taxa_names(physeq), 1)
+    warning("Randomly assigning root as -- ", outgroup_taxon, " -- in the phylogenetic tree in the data you provided.")
+    phy_tree(physeq) <- root(phy=phy_tree(physeq), outgroup=outgroup_taxon, resolve.root=TRUE, interactive=FALSE)
+    # Give up if the automatic rooting did not produce a rooted tree.
+    if( !is.rooted(phy_tree(physeq)) ){
+      stop("Problem automatically rooting tree. Make sure your tree is rooted before attempting UniFrac calculation. See ?ape::root")
+    }
+  }
+  # Supported path: return straight away.
+  if( fast ){
+    return(fastUniFrac(physeq, weighted, normalized, parallel))
+  }
+  # Deprecated path: `fast=FALSE` only adds a warning; the same implementation runs.
+  warning("Option `fast=FALSE` is deprecated. Only 'fast' UniFrac is supported in phyloseq.")
+  fastUniFrac(physeq, weighted, normalized, parallel)
+})
+################################################################################
+# Fast UniFrac for R.
+# Adapted from The ISME Journal (2010) 4, 17-27; doi:10.1038/ismej.2009.97;
+# http://www.nature.com/ismej/journal/v4/n1/full/ismej200997a.html
+################################################################################
+#' @importFrom ape prop.part
+#' @importFrom ape reorder.phylo
+#' @importFrom ape node.depth
+#' @keywords internal
+#' @import foreach
+fastUniFrac <- function(physeq, weighted=FALSE, normalized=TRUE, parallel=FALSE){
+  # Internal engine behind UniFrac(): computes the distance for every pair of
+  # samples and returns them as a `dist` object.
+  #
+  # physeq:     phyloseq object; both its otu_table and phy_tree are accessed.
+  # weighted:   TRUE for abundance-weighted UniFrac, FALSE for presence/absence.
+  # normalized: only consulted when `weighted` is TRUE; the weighted numerator
+  #             is then divided by the tip-age based denominator.
+  # parallel:   if FALSE the sequential foreach backend is registered before
+  #             the pairwise loop; if TRUE no backend is registered here.
+  #
+  # Stops if the tree has no branch lengths, is not rooted, or its number of
+  # tips differs from ntaxa(physeq).
+
+  # Access the needed components. Note, will error if missing in physeq.
+  OTU <- otu_table(physeq)
+  tree <- phy_tree(physeq)
+  # Some important checks.
+  if( is.null(tree$edge.length) ) {
+    stop("Tree has no branch lengths, cannot compute UniFrac")
+  }
+  if( !is.rooted(tree) ) {
+    stop("Rooted phylogeny required for UniFrac calculation")
+  }
+  ### Some parallel-foreach housekeeping.
+  # If user specifies not-parallel run (the default), register the sequential "back-end"
+  if( !parallel ){ registerDoSEQ() }
+  # List of all pairwise combinations of sample names (one length-2 vector per pair).
+  spn <- combn(sample_names(physeq), 2, simplify=FALSE)
+  # Make sure OTU is in species-are-rows orientation
+  if( !taxa_are_rows(physeq) ){OTU <- t(OTU)}
+  # Convert to standard matrix
+  OTU <- as(OTU, "matrix")
+  # Enforce that tree and otu_table indices are the same order,
+  # by re-ordering OTU, if needed
+  if( !all(rownames(OTU) == taxa_names(tree)) ){
+    OTU <- OTU[taxa_names(tree), ]
+  }
+  ########################################
+  # Build the requisite matrices as defined
+  # in the Fast UniFrac article.
+  ########################################
+  ## This only needs to happen once in a call to UniFrac.
+  ## Notice that A and B do not appear in this section.
+  # Begin by building the edge descendants matrix (edge-by-sample)
+  # `edge_array`
+  ntip <- length(tree$tip.label)
+  if(ntip != ntaxa(physeq)) stop("Incompatible tree and OTU table!")
+  # Map each internal node to its direct descendants. The children are grouped
+  # per parent node (list element named by the parent's node number), so a node
+  # may have any number of children; reshaping into a fixed two-column matrix
+  # would mis-assign children as soon as the tree contains a polytomy.
+  node.desc <- split(tree$edge[, 2], tree$edge[, 1])
+  # Define the edge_array object
+  # Right now this is a node_array object, each row is a node (including tips)
+  # It will be subset and ordered to match tree$edge later
+  edge_array <- matrix(0, nrow=ntip+tree$Nnode, ncol=nsamples(physeq),
+                       dimnames=list(NULL, sample_names=sample_names(physeq)))
+  # Load the tip counts in directly
+  edge_array[seq_len(ntip), ] <- OTU
+  # Internal nodes ordered by increasing node.depth(), so that the rows of a
+  # node's children are filled in before the node itself is summed.
+  ord.node <- order(node.depth(tree))[(ntip+1):(ntip+tree$Nnode)]
+  # Loop over internal nodes, summing their descendants to get that node's count
+  for(i in ord.node){
+    edge_array[i,] <- colSums(edge_array[node.desc[[as.character(i)]], , drop=FALSE], na.rm = TRUE)
+  }
+  # Keep only those with a parental edge (drops root) and order to match tree$edge
+  edge_array <- edge_array[tree$edge[,2],]
+  # Remove unneeded variables.
+  rm(node.desc)
+  # If unweighted-UniFrac, coerce to a presence-absence contingency, occ
+  if(!weighted){
+    # For unweighted UniFrac, convert the edge_array to an occurrence (presence/absence binary) array
+    edge_occ <- (edge_array > 0) - 0
+  }
+  # Scalar condition, hence the short-circuit `&&`.
+  if( weighted && normalized ){
+    # This is only relevant to weighted-UniFrac.
+    # For denominator in the normalized distance, we need the age of each tip.
+    # 'z' is the tree in postorder order used in calls to .C
+    # Descending order of left-hand side of edge (the ancestor to the node)
+    z = reorder.phylo(tree, order="postorder")
+    # Call phyloseq-internal function that in-turn calls ape's internal
+    # horizontal position function, in C, using the re-ordered phylo object, `z`
+    tipAges = ape_node_depth_edge_length(Ntip = length(tree$tip.label),
+                                         Nnode = tree$Nnode,
+                                         edge = z$edge,
+                                         Nedge = nrow(tree$edge)[1],
+                                         edge.length = z$edge.length)
+    # Keep only the tips, and add the tip labels in case `z` order differs from `tree`
+    tipAges <- tipAges[seq_len(ntip)]
+    names(tipAges) <- z$tip.label
+    # Explicitly re-order tipAges to match OTU
+    tipAges <- tipAges[rownames(OTU)]
+  }
+  ########################################
+  # optionally-parallel implementation with foreach
+  ########################################
+  samplesums = sample_sums(physeq)
+  distlist <- foreach( i = spn, .packages="phyloseq") %dopar% {
+    # `i` is one pair of sample names.
+    A <- i[1]
+    B <- i[2]
+    AT <- samplesums[A]
+    BT <- samplesums[B]
+    if( weighted ){
+      # weighted UniFrac: per-edge difference in relative abundance
+      wUF_branchweight <- abs(edge_array[, A]/AT - edge_array[, B]/BT)
+      # calculate the w-UF numerator
+      numerator <- sum({tree$edge.length * wUF_branchweight}, na.rm = TRUE)
+      # if not-normalized weighted UniFrac, just return "numerator";
+      # the u-value in the w-UniFrac description
+      if(!normalized){
+        return(numerator)
+      } else {
+        # denominator (assumes tree-indices and otu_table indices are same order)
+        denominator <- sum({tipAges * (OTU[, A]/AT + OTU[, B]/BT)}, na.rm = TRUE)
+        # return the normalized weighted UniFrac values
+        return(numerator / denominator)
+      }
+    } else {
+      # Unweighted UniFrac
+      # Subset matrix to just columns A and B
+      edge_occ_AB <- edge_occ[, c(A, B)]
+      # Keep only the unique branches. Sum the lengths
+      edge_uni_AB_sum <- sum((tree$edge.length * edge_occ_AB)[rowSums(edge_occ_AB, na.rm=TRUE) < 2, ], na.rm=TRUE)
+      # Normalize this sum to the total branches among these two samples, A and B
+      uwUFpairdist <- edge_uni_AB_sum / sum(tree$edge.length[rowSums(edge_occ_AB, na.rm=TRUE) > 0])
+      return(uwUFpairdist)
+    }
+  }
+  # Initialize UniFracMat with NAs
+  UniFracMat <- matrix(NA_real_, nsamples(physeq), nsamples(physeq))
+  rownames(UniFracMat) <- colnames(UniFracMat) <- sample_names(physeq)
+  # Matrix-assign lower-triangle of UniFracMat. Then coerce to dist and return.
+  matIndices <- do.call(rbind, spn)[, 2:1]
+  # Take care of edge case where there are two samples -> 1 pair of indices -> rbind doesn't return a matrix
+  if(!is.matrix(matIndices)) matIndices <- matrix(matIndices, ncol=2)
+  UniFracMat[matIndices] <- unlist(distlist)
+  return(as.dist(UniFracMat))
+}
+################################################################################
diff --git a/R/extend_DESeq2.R b/R/extend_DESeq2.R
new file mode 100644
index 0000000..45a864e
--- /dev/null
+++ b/R/extend_DESeq2.R
@@ -0,0 +1,72 @@
+################################################################################
+#' Convert phyloseq data to DESeq2 dds object
+#'
+#' No testing is performed by this function. The phyloseq data is converted
+#' to the relevant \code{\link[DESeq2]{DESeqDataSet}} object, which can then be
+#' tested in the negative binomial generalized linear model framework
+#' of the \code{\link[DESeq2]{DESeq}} function in DESeq2 package.
+#' See the
+#' \href{http://joey711.github.io/phyloseq-extensions}{phyloseq-extensions}
+#' tutorials for more details.
+#'
+#' @param physeq (Required). \code{\link{phyloseq-class}}.
+#' Must have a \code{\link{sample_data}} component.
+#'
+#' @param design (Required). A \code{\link{formula}} which specifies the design of the experiment,
+#' taking the form \code{formula(~ x + y + z)}. That is, a formula with right-hand side only.
+#' By default, the functions in this package and DESeq2
+#' will use the last variable in the formula (e.g. \code{z})
+#' for presenting results (fold changes, etc.) and plotting.
+#' When considering your specification of experimental design, you will want to
+#' re-order the levels so that the \code{NULL} set is first.
+#' For example, the following line of code would ensure that Enterotype 1 is used as the
+#' reference sample class in tests by setting it to the first of the factor levels
+#' using the \code{\link{relevel}} function:
+#'
+#' \code{sample_data(entill)$Enterotype <- relevel(sample_data(entill)$Enterotype, "1")}
+#'
+#' @param ... (Optional). Additional named arguments passed to \code{\link[DESeq2]{DESeqDataSetFromMatrix}}.
+#' Most users will not need to pass any additional arguments here.
+#' Most testing-related options should be provided in
+#' a following call to \code{\link[DESeq2]{DESeq}}.
+#'
+#' @return A \code{\link[DESeq2]{DESeqDataSet}} object.
+#'
+#' @seealso
+#'
+#' \code{vignette("phyloseq-mixture-models")}
+#'
+#' The
+#' \href{http://joey711.github.io/phyloseq-extensions}{phyloseq-extensions}
+#' tutorials.
+#'
+#' \code{\link[DESeq2]{DESeq}}
+#'
+#' \code{\link[DESeq2]{results}}
+#'
+#' \code{\link[DESeq2]{DESeqDataSetFromMatrix}}
+#'
+#' @export
+#'
+#' @examples
+#' # Check out the vignette phyloseq-mixture-models for more details.
+#' # vignette("phyloseq-mixture-models")
+#' data(soilrep)
+#' phyloseq_to_deseq2(soilrep, ~warmed)
+phyloseq_to_deseq2 = function(physeq, design, ...){
+  # Converts `physeq` into a DESeq2 data set: the rounded OTU counts become the
+  # count matrix (taxa as rows), the sample_data becomes the column data, and
+  # `design` plus any `...` are passed on to DESeq2::DESeqDataSetFromMatrix().
+  # Stops if `physeq` has no sample_data or if DESeq2 is not installed.
+
+  # The design is specified against sample variables, so sample_data is mandatory.
+  if( is.null(sample_data(physeq, FALSE)) ){
+    stop("There must be sample_data present, for specifying experimental design. See ?phyloseq_to_deseq2")
+  }
+  # DESeq2 is only loaded on demand. Signal its absence explicitly instead of
+  # falling off the end of the function and returning an invisible NULL.
+  if( !requireNamespace("DESeq2", quietly=TRUE) ){
+    stop("The DESeq2 package is required by phyloseq_to_deseq2() but is not installed.")
+  }
+  # Enforce orientation. Samples are columns
+  if( !taxa_are_rows(physeq) ){ physeq <- t(physeq) }
+  # Coerce count data to a plain matrix of whole numbers
+  countData = round(as(otu_table(physeq), "matrix"), digits=0)
+  colData = data.frame(sample_data(physeq))
+  # Create the DESeq data set, dds.
+  dds <- DESeq2::DESeqDataSetFromMatrix(countData, colData, design, ...)
+  return(dds)
+}
+################################################################################
diff --git a/R/extend_metagenomeSeq.R b/R/extend_metagenomeSeq.R
new file mode 100644
index 0000000..766a07d
--- /dev/null
+++ b/R/extend_metagenomeSeq.R
@@ -0,0 +1,68 @@
+################################################################################
+#' Convert phyloseq data to MetagenomeSeq MRexperiment object
+#'
+#' No testing is performed by this function. The phyloseq data is converted
+#' to the relevant \code{\link[metagenomeSeq]{MRexperiment-class}} object, which can then be
+#' tested in the zero-inflated mixture model framework
+#' (e.g. \code{\link[metagenomeSeq]{fitZig}})
+#' in the metagenomeSeq package.
+#' See the
+#' \href{http://joey711.github.io/phyloseq-extensions}{phyloseq-extensions}
+#' tutorials for more details.
+#'
+#' @param physeq (Required). \code{\link{phyloseq-class}}.
+#' @param ... (Optional). Additional named arguments passed
+#' to \code{\link[metagenomeSeq]{newMRexperiment}}.
+#' Most users will not need to pass any additional arguments here.
+#'
+#' @return A \code{\link[metagenomeSeq]{MRexperiment-class}} object.
+#'
+#' @seealso
+#'
+#' \code{\link[metagenomeSeq]{fitTimeSeries}}
+#' \code{\link[metagenomeSeq]{fitLogNormal}}
+#' \code{\link[metagenomeSeq]{fitZig}}
+#' \code{\link[metagenomeSeq]{MRtable}}
+#' \code{\link[metagenomeSeq]{MRfulltable}}
+#'
+#' @export
+#' @importFrom Biobase AnnotatedDataFrame
+#'
+#' @examples
+#' # Check out the vignette metagenomeSeq for more details.
+#' # vignette("metagenomeSeq")
+#' data(soilrep)
+#' phyloseq_to_metagenomeSeq(soilrep)
+phyloseq_to_metagenomeSeq = function(physeq, ...){
+  # Converts `physeq` into a metagenomeSeq MRexperiment: rounded OTU counts
+  # (taxa as rows), sample_data as phenoData when present, and the taxa names
+  # (plus the tax_table when present) as featureData. The object is returned
+  # after cumNorm() has been applied. `...` goes to newMRexperiment().
+  # Stops if metagenomeSeq is not installed.
+
+  # metagenomeSeq is only loaded on demand. Signal its absence explicitly
+  # instead of falling off the end of the function and returning an invisible NULL.
+  if( !requireNamespace("metagenomeSeq", quietly=TRUE) ){
+    stop("The metagenomeSeq package is required by phyloseq_to_metagenomeSeq() but is not installed.")
+  }
+  # Enforce orientation. Samples are columns
+  if( !taxa_are_rows(physeq) ){ physeq <- t(physeq) }
+  # Coerce count data to a plain matrix of whole numbers
+  countData = round(as(otu_table(physeq), "matrix"), digits=0)
+  # Create sample annotation if possible
+  if( !is.null(sample_data(physeq, FALSE)) ){
+    ADF = AnnotatedDataFrame(data.frame(sample_data(physeq)))
+  } else {
+    ADF = NULL
+  }
+  # Create taxa annotation if possible; the OTU names are always included
+  if( !is.null(tax_table(physeq, FALSE)) ){
+    TDF = AnnotatedDataFrame(data.frame(OTUname = taxa_names(physeq),
+                                        data.frame(tax_table(physeq)), row.names = taxa_names(physeq)))
+  } else {
+    TDF = AnnotatedDataFrame(data.frame(OTUname = taxa_names(physeq),
+                                        row.names = taxa_names(physeq)))
+  }
+  # Create MRexperiment
+  mrobj = metagenomeSeq::newMRexperiment(counts = countData, phenoData = ADF, featureData = TDF, ...)
+  # Calculate normalization factor.
+  # cumNormStat() is used when at least one sample has no more than one
+  # non-zero taxon; otherwise cumNormStatFast() is used.
+  # NOTE(review): presumably the fast variant cannot handle such samples -- confirm
+  # against the metagenomeSeq documentation.
+  if( sum(colSums(countData > 0) > 1) < ncol(countData) ){
+    p = suppressMessages(metagenomeSeq::cumNormStat(mrobj))
+  } else {
+    p = suppressMessages(metagenomeSeq::cumNormStatFast(mrobj))
+  }
+  mrobj = metagenomeSeq::cumNorm(mrobj, p = p)
+  return(mrobj)
+}
diff --git a/R/extend_vegan.R b/R/extend_vegan.R
new file mode 100644
index 0000000..2da390b
--- /dev/null
+++ b/R/extend_vegan.R
@@ -0,0 +1,274 @@
+################################################################################
+# Define S3 methods for scores (originally defined by vegan-package)
+# to work for other ordination results
+# vegan:::scores.default
+################################################################################
+# pcoa-class, from pcoa{ape}
+#' @importFrom vegan wascores
+#' @importFrom vegan scores
+#' @keywords internal
+scores.pcoa <- function(x, choices=NULL, display="sites", physeq=NULL, ...){
+  # Returns ordination coordinates from a pcoa result: the columns `choices`
+  # of `x$vectors` for "sites" and, when "species" is requested and `physeq`
+  # provides an OTU table, weighted-average species scores from wascores().
+  # A single requested element is returned bare, several as a list.
+
+  # No selection means every axis.
+  if( is.null(choices) ){
+    choices <- colnames(x$vectors)
+  }
+  result = list(sites = x$vectors[, choices])
+  wants_species <- "species" %in% display
+  if( wants_species && is.null(otu_table(physeq, errorIfNULL = FALSE)) ){
+    warning("scores.pcoa: Failed to access OTU table from `physeq` argument, \n
+ needed for weighted average of OTU/taxa/species points in MDS/PCoA.")
+  } else if( wants_species ){
+    # MDS/PCoA only positions the elements of the distance matrix (usually
+    # sites/samples), so species get the weighted average of those positions.
+    result$species <- wascores(x$vectors[, choices], w = veganifyOTU(physeq))
+  }
+  result <- result[display]
+  # Two or more elements stay wrapped in the list.
+  if( length(result) >= 2L ){
+    return(result)
+  }
+  # Otherwise unlist to the single requested element.
+  result[[display]]
+}
+################################################################################
+# DPCoA management
+################################################################################
+#' @importFrom vegan scores
+#' @keywords internal
+get_dpcoa_species_coords = function(x, physeq=NULL){
+  # Returns `x$dls` (the species coordinates of the dpcoa object) with its row
+  # names mapped back to the taxa names of `physeq`. If no taxa names can be
+  # obtained a warning is raised and the coordinates are returned unchanged.
+  species_xy = x$dls
+  otu_ids = taxa_names(physeq)
+  if( is.null(otu_ids) ){
+    warning("scores.dpcoa: Failed to access `taxa_names` from `physeq` argument, \n
+ needed to ensure correct mapping of OTU/taxa/species points in DPCoA.")
+    return(species_xy)
+  }
+  # ade4 mangles the element names using `make.names` conventions in base R.
+  # Apply the same conversion to the real names to build a lookup from
+  # mangled name to original name, then use it to restore the row names.
+  restore = otu_ids
+  names(restore) <- make.names(otu_ids)
+  rownames(species_xy) <- restore[rownames(species_xy)]
+  return(species_xy)
+}
+#' @importFrom vegan scores
+#' @keywords internal
+get_dpcoa_sites_coords = function(x, physeq=NULL){
+  # Returns `x$li` (the site coordinates of the dpcoa object) with its row
+  # names mapped back to the sample names of `physeq`. If no sample names can
+  # be obtained a warning is raised and the coordinates are returned unchanged.
+  sites_xy = x$li
+  sample_ids = sample_names(physeq)
+  if( is.null(sample_ids) ){
+    warning("scores.dpcoa: Failed to access `sample_names` from `physeq` argument, \n
+ needed to ensure correct mapping of site/sample/library points in DPCoA.")
+    return(sites_xy)
+  }
+  # ade4 mangles the element names using `make.names` conventions in base R.
+  # Apply the same conversion to the real names to build a lookup from
+  # mangled name to original name, then use it to restore the row names.
+  restore = sample_ids
+  names(restore) <- make.names(sample_ids)
+  rownames(sites_xy) <- restore[rownames(sites_xy)]
+  return(sites_xy)
+}
+# dpcoa-class, from ade4
+#' @importFrom vegan scores
+#' @keywords internal
+scores.dpcoa <- function(x, choices=NULL, display="sites", physeq=NULL, ...){
+  # Returns the coordinates of a dpcoa ordination `x` for either the sites or
+  # the species, restricted to the axes in `choices` (all axes when NULL).
+  # `physeq` is handed to the coordinate helpers so that they can restore the
+  # original sample/taxa names. The result always keeps two dimensions.
+
+  # `display` must be either "sites" or "species", per vegan-package convention.
+  # Anything else is rejected here with an explicit message, rather than
+  # letting a NULL fall through to an obscure indexing error below.
+  coords <- switch(EXPR = display,
+                   species = get_dpcoa_species_coords(x, physeq),
+                   sites = get_dpcoa_sites_coords(x, physeq),
+                   stop("scores.dpcoa: `display` must be either \"sites\" or \"species\"."))
+  # If no choices selection, take all dimensions/columns
+  if( is.null(choices) ){
+    choices <- seq_len(ncol(coords))
+  }
+  return( coords[, choices, drop=FALSE] )
+}
+################################################################################
+# Extend vegdist for phyloseq classes
+################################################################################
+# \code{\link[vegan]{vegdist}} wrapper for phyloseq classes
+#
+# Trivially-extended S4 method from the \code{\link[vegan]{vegdist}} function,
+# such that S4 classes from the \code{\link{phyloseq-package}} are properly
+# handled / accessed. All parameters passed on to \code{\link[vegan]{vegdist}}
+# verbatim.
+#
+# @seealso \code{\link[vegan]{vegdist}}
+# @rdname vegdist-methods
+# @docType methods
+# @aliases vegdist
+#
+# @examples
+# data(esophagus)
+# vegdist(esophagus, "jaccard")
+# Declare an S4 generic named "vegdist" (the function imported from vegan),
+# so that S4 methods can be registered on it with setMethod().
+#' @importFrom vegan vegdist
+#' @keywords internal
+setGeneric("vegdist")
+################################################################################
+# @aliases vegdist,otu_table-method
+# @rdname vegdist-methods
+#' @importFrom vegan vegdist
+setMethod("vegdist", "otu_table", function(x, method = "bray", binary = FALSE,
+                                           diag = FALSE, upper = FALSE, na.rm = FALSE, ...){
+  # Put the table in sample-by-species orientation and strip it down to a
+  # simple matrix in one step ...
+  x <- as(if( taxa_are_rows(x) ) t(x) else x, "matrix")
+  # ... then hand over to the standard method with every argument forwarded as given.
+  vegdist(x, method, binary, diag, upper, na.rm, ...)
+})
+################################################################################
+# @aliases vegdist,phyloseq-method
+# @rdname vegdist-methods
+# Method for a full phyloseq object: only its otu_table component is used.
+setMethod("vegdist", "phyloseq", function(x, method = "bray", binary = FALSE,
+ diag = FALSE, upper = FALSE, na.rm = FALSE, ...){
+ # Simply access the otu_table
+ x <- otu_table(x)
+ # Call the generic again on the extracted table, forwarding all other
+ # arguments positionally and unchanged (presumably this lands on the
+ # otu_table method -- depends on the class otu_table() returns).
+ vegdist(x, method, binary, diag, upper, na.rm, ...)
+})
+################################################################################
+#' Summarize alpha diversity
+#'
+#' Performs a number of standard alpha diversity estimates,
+#' and returns the results as a \code{data.frame}.
+#' Strictly speaking, this function is not only estimating richness,
+#' despite its name.
+#' It can operate on the cumulative population of all
+#' samples in the dataset, or by repeating the richness estimates for each
+#' sample individually.
+#' NOTE: You must use untrimmed datasets
+#' for meaningful results, as these estimates (and even the ``observed'' richness)
+#' are highly dependent on the number of singletons. You can always trim the data
+#' later on if needed, just not before using this function.
+#'
+#' @param physeq (Required). \code{\link{phyloseq-class}}, or alternatively,
+#' an \code{\link{otu_table-class}}. The data about which you want to estimate
+#' the richness.
+#'
+#' @param split (Optional). Logical. Should a separate set of richness estimates
+#' be performed for each sample? Or alternatively, pool all samples and
+#' estimate richness of the entire set.
+#'
+#' @param measures (Optional). Default is \code{NULL}, meaning that
+#' all available alpha-diversity measures will be included.
+#' Alternatively, you can specify one or more measures
+#' as a character vector of measure names.
+#' Values must be among those supported:
+#' \code{c("Observed", "Chao1", "ACE", "Shannon", "Simpson", "InvSimpson", "Fisher")}.
+#'
+#' @return A \code{data.frame} of the richness estimates, and their standard error.
+#'
+#' @seealso
+#' Check out the custom plotting function, \code{\link{plot_richness}},
+#' for easily showing the results of different estimates,
+#' with method-specific error-bars.
+#' Also check out the internal functions borrowed from the \code{vegan} package:
+#'
+#' \code{\link[vegan]{estimateR}}
+#'
+#' \code{\link[vegan]{diversity}}
+#'
+#' \code{\link[vegan]{fisherfit}}
+#'
+#' @importFrom vegan estimateR
+#' @importFrom vegan diversity
+#' @importFrom vegan fisher.alpha
+#' @export
+#' @examples
+#' ## There are many more interesting examples at the phyloseq online tutorials.
+#' ## http://joey711.github.com/phyloseq/plot_richness-examples
+#' data("esophagus")
+#' # Default is all available measures
+#' estimate_richness(esophagus)
+#' # Specify just one:
+#' estimate_richness(esophagus, measures="Observed")
+#' # Specify a few:
+#' estimate_richness(esophagus, measures=c("Observed", "InvSimpson", "Shannon", "Chao1"))
+estimate_richness <- function(physeq, split=TRUE, measures=NULL){
+  # Computes the requested alpha-diversity measures and returns them as a
+  # data.frame, with the accompanying `se.*` columns where they are produced.
+  #
+  # physeq:   object from which otu_table() / taxa_sums() can be obtained.
+  # split:    TRUE computes the measures on the sample-by-taxa matrix;
+  #           FALSE computes them once on the per-taxon sums.
+  # measures: NULL for all supported measures, or a character vector of
+  #           names; the old-style names ("S.obs", "shannon", ...) are
+  #           translated to the current ones.
+  # Stops if none of `measures` is supported.
+
+  if( !any(otu_table(physeq)==1) ){
+    # Check for singletons, and then warning if they are missing.
+    # These metrics only really meaningful if singletons are included.
+    warning(
+      "The data you have provided does not have\n",
+      "any singletons. This is highly suspicious. Results of richness\n",
+      "estimates (for example) are probably unreliable, or wrong, if you have already\n",
+      "trimmed low-abundance taxa from the data.\n",
+      "\n",
+      "We recommended that you find the un-trimmed data and retry."
+    )
+  }
+
+  # If we are splitting sample-wise, enforce samples-as-rows orientation.
+  # Else, sum the species over all samples.
+  if( split ){
+    OTU <- as(otu_table(physeq), "matrix")
+    if( taxa_are_rows(physeq) ){ OTU <- t(OTU) }
+  } else {
+    OTU <- taxa_sums(physeq)
+  }
+
+  # Define renaming vector: names are the old-style labels, values the current ones.
+  renamevec = c("Observed", "Chao1", "ACE", "Shannon", "Simpson", "InvSimpson", "Fisher")
+  names(renamevec) <- c("S.obs", "S.chao1", "S.ACE", "shannon", "simpson", "invsimpson", "fisher")
+  # If measures was not explicitly provided (is NULL), set to all supported methods
+  if( is.null(measures) ){
+    measures = as.character(renamevec)
+  }
+  # Rename measures if they are in the old-style. The lookup is done per
+  # element, so each entry is replaced by its own translation regardless of
+  # the order (or repetition) in which the caller listed them.
+  oldstyle = measures %in% names(renamevec)
+  if( any(oldstyle) ){
+    measures[oldstyle] <- renamevec[measures[oldstyle]]
+  }
+
+  # Stop with error if no measures are supported
+  if( !any(measures %in% renamevec) ){
+    stop("None of the `measures` you provided are supported. Try default `NULL` instead.")
+  }
+
+  # Collect the pieces to be column-bound at the end
+  outlist = vector("list")
+  # estimateR() produces these three together, so it runs if any one is requested
+  estimRmeas = c("Chao1", "Observed", "ACE")
+  if( any(estimRmeas %in% measures) ){
+    outlist <- c(outlist, list(t(data.frame(estimateR(OTU)))))
+  }
+  if( "Shannon" %in% measures ){
+    outlist <- c(outlist, list(shannon = diversity(OTU, index="shannon")))
+  }
+  if( "Simpson" %in% measures ){
+    outlist <- c(outlist, list(simpson = diversity(OTU, index="simpson")))
+  }
+  if( "InvSimpson" %in% measures ){
+    outlist <- c(outlist, list(invsimpson = diversity(OTU, index="invsimpson")))
+  }
+  if( "Fisher" %in% measures ){
+    # On a warning from fisher.alpha(), re-issue a phyloseq-specific warning
+    # and recompute with warnings suppressed, keeping only alpha and its se.
+    fisher = tryCatch(fisher.alpha(OTU, se=TRUE),
+      warning=function(w){
+        warning("phyloseq::estimate_richness: Warning in fisher.alpha(). See `?fisher.fit` or ?`fisher.alpha`. Treat fisher results with caution")
+        suppressWarnings(fisher.alpha(OTU, se=TRUE)[, c("alpha", "se")])
+      }
+    )
+    if( !is.null(dim(fisher)) ){
+      colnames(fisher)[1:2] <- c("Fisher", "se.fisher")
+      outlist <- c(outlist, list(fisher))
+    } else {
+      outlist <- c(outlist, Fisher=list(fisher))
+    }
+  }
+  out = do.call("cbind", outlist)
+  # Rename columns per renamevec
+  namechange = intersect(colnames(out), names(renamevec))
+  colnames(out)[colnames(out) %in% namechange] <- renamevec[namechange]
+  # Final prune to just those columns related to "measures": the measure's own
+  # column and its optional "se." companion. The patterns are anchored so a
+  # name only selects exactly those columns and never another column that
+  # merely contains it. Matching stays case-insensitive.
+  colkeep = lapply(paste0("^(se\\.)?", measures, "$"), grep, colnames(out), ignore.case=TRUE)
+  out = out[, sort(unique(unlist(colkeep))), drop=FALSE]
+  # Make sure that you return a data.frame for reliable performance.
+  out <- as.data.frame(out)
+  return(out)
+}
+################################################################################
diff --git a/R/extract-methods.R b/R/extract-methods.R
new file mode 100644
index 0000000..e334d75
--- /dev/null
+++ b/R/extract-methods.R
@@ -0,0 +1,74 @@
+################################################################################
+# subsetting functions
+# Without these, the default coerces to the base object (e.g. matrix or data.frame)
+################################################################################
+#' Method extensions to extraction operator for phyloseq objects.
+#'
+#' See the documentation for the \code{\link[base]{Extract}} generic,
+#' defined in the R \code{\link[base]{base-package}}
+#' for the expected behavior.
+#'
+#' One special exception to standard behavior of these methods in phyloseq is that
+#' the \code{drop} argument is set internally to \code{FALSE}.
+#' This helps avoid bugs during complicated subsetting with multiple components,
+#' where it is necessary to be able to use a two dimensional indexing even
+#' if one of those dimensions has only 1 rank.
+#' Put another way, these phyloseq-defined extractions never collapse their result
+#' into a vector. See the documentation of \code{\link[base]{Extract}} for
+#' more information about the \code{drop} argument.
+#'
+#' @param j See \code{\link[base]{Extract}}
+#'
+#' @param ... See \code{\link[base]{Extract}}
+#'
+#' @seealso \code{\link[base]{Extract}}
+#'
+#' @export
+#'
+#' @rdname extract-methods
+#' @inheritParams base::Extract
+#' @examples
+#' data(esophagus)
+#' nrow(otu_table(esophagus))
+#' nrow(otu_table(esophagus)[1:5, ])
+setMethod("[", "otu_table", function(x, i, j, ...){
+  # Subset the underlying matrix without ever dropping a dimension, then
+  # rebuild the otu_table using the orientation flag of the original object.
+  otu_table(as(x, "matrix")[i, j, drop=FALSE], taxa_are_rows(x))
+})
+# extract parts of sample_data
+#
+#' @export
+#' @rdname extract-methods
+setMethod("[", "sample_data", function(x, i, j, ...){
+  # Extract from the plain data.frame (never collapsing to a vector), then re-wrap.
+  extracted <- data.frame(x)[i, j, drop=FALSE]
+  sample_data(extracted)
+})
+# extract parts of taxonomyTable
+#
+#' @export
+#' @rdname extract-methods
+setMethod("[", "taxonomyTable", function(x, i, j, ...){
+  # Standard matrix extraction with both dimensions preserved.
+  taxmat <- as(x, "matrix")[i, j, drop=FALSE]
+  # Rebuild the taxonomy table from the extracted matrix.
+  tax_table(taxmat)
+})
+# A numeric extraction method is already defined in Biostrings for XStringSet
+# Add name-character-based extraction method for XStringSet
+#
+#' @importClassesFrom Biostrings XStringSet
+#' @export
+#' @rdname extract-methods
+setMethod("[", c("XStringSet", "character"), function(x, i){
+  # Translate the requested sequence names into integer positions and
+  # discard every name that does not occur in names(x).
+  positions = match(i, names(x), nomatch=NA_integer_)
+  found = positions[!is.na(positions)]
+  if( length(found) == 0 ){
+    # None of the requested names exist: warn and give back NULL.
+    warning("[,XStringSet: no valid seq-indices provided, NULL returned")
+    return(NULL)
+  }
+  if( length(found) < length(i) ){
+    # Only part of the requested names exist: warn, continue with the rest.
+    warning("[,XStringSet: some seq-name indices invalid, omitted.")
+  }
+  # `found` is an integer vector, so this subsetting dispatches to the
+  # standard (non-character) extraction method.
+  x[found]
+})
+################################################################################
+################################################################################
diff --git a/R/merge-methods.R b/R/merge-methods.R
new file mode 100644
index 0000000..4f4ce20
--- /dev/null
+++ b/R/merge-methods.R
@@ -0,0 +1,600 @@
+################################################################################
+#' Merge arguments into one phyloseq object.
+#'
+#' Takes a comma-separated list of phyloseq objects as arguments,
+#' and returns the most-comprehensive single phyloseq object possible.
+#'
+#' Higher-order objects can be created if arguments are appropriate component data
+#' types of different
+#' classes, and this should mirror the behavior of the \code{\link{phyloseq}} method,
+#' which is the suggested method if the goal is simply to create a higher-order
+#' phyloseq object from different data types (1 of each class) describing the same experiment.
+#'
+#' By contrast, this method is intended for situations in which one wants to combine
+#' multiple higher-order objects, or multiple core component data objects (e.g. more than one
+#' \code{otu_table}) that should be combined into one object.
+#'
+#' Merges are performed by first separating higher-order objects into
+#' a list of their component objects; then, merging any component objects of the same class
+#' into one object according to the behavior described in \code{\link{merge_phyloseq_pair}};
+#' and finally, building back up a merged-object according to the constructor
+#' behavior of the \code{\link{phyloseq}} method. If the arguments contain only a single
+#' component type -- several otu_table objects, for example -- then a single merged object
+#' of that component type is returned.
+#'
+#' @usage merge_phyloseq(...)
+#'
+#' @param ... a comma-separated list of phyloseq objects.
+#'
+#' @return Merges are performed by first separating higher-order objects into
+#' a list of their component objects; then, merging any component objects of the same class
+#' into one object according to the behavior described in \code{\link{merge_phyloseq_pair}};
+#' and finally, re-building a merged-object according to the constructor
+#' behavior of the \code{\link{phyloseq}} method. If the arguments contain only a single
+#' component type -- several otu_table objects, for example -- then a single merged object
+#' of the relevant component type is returned.
+#'
+#' Merges between 2 or more tree objects are ultimately done using
+#' \code{\link[ape]{consensus}} from the ape package.
+#' This has the potential to limit somewhat the final data object, because trees
+#' don't merge with other trees in the same granular manner as data tables, and
+#' ultimately the species/taxa in higher-order phyloseq objects will be clipped to
+#' what is contained in the tree. If this is an issue, the tree component should
+#' be omitted from the argument list.
+#'
+#' @export
+#'
+#' @examples #
+#' ## # Make a random complex object
+#' ## OTU1 <- otu_table(matrix(sample(0:5,250,TRUE),25,10), taxa_are_rows=TRUE)
+#' ## tax1 <- tax_table(matrix("abc", 30, 8))
+#' ## map1 <- data.frame( matrix(sample(0:3,250,TRUE),25,10),
+#' ## matrix(sample(c("a","b","c"),150,TRUE), 25, 6) )
+#' ## map1 <- sample_data(map1)
+#' ## exam1 <- phyloseq(OTU1, map1, tax1)
+#' ## x <- exam1
+#' ## x <- phyloseq(exam1)
+#' ## y <- tax_table(exam1)
+#' ## merge_phyloseq(x, y)
+#' ## merge_phyloseq(y, y, y, y)
+merge_phyloseq <- function(...){
+  # Merge any number of phyloseq objects / component objects into the most
+  # comprehensive single object that phyloseq() can build from them.
+  arguments <- list(...)
+  # Fail early with a clear message when called without any argument.
+  if( length(arguments) < 1L ){
+    stop("merge_phyloseq: at least one phyloseq object must be provided.")
+  }
+  # create list of all components of all objects
+  comp.list <- list()
+  for( i in seq_along(arguments) ){
+    comp.list <- c(comp.list, splat.phyloseq.objects(arguments[[i]]))
+  }
+  # Component names repeat across arguments (e.g. several "tax_table").
+  # For each distinct name, fold all same-named components into one object,
+  # left to right, with merge_phyloseq_pair(). A component that occurs only
+  # once is passed through untouched, because Reduce() returns a single
+  # element without calling the merge function.
+  merged.list <- lapply(unique(names(comp.list)), function(comp.name){
+    Reduce(merge_phyloseq_pair, comp.list[names(comp.list) == comp.name])
+  })
+  # merged.list is deliberately unnamed, to avoid any conflicts with
+  # phyloseq(), which does not need named-arguments.
+
+  # Use do.call for calling this variable-length, variable-content argument list.
+  return( do.call(phyloseq, merged.list) )
+}
+################################################################################
+#' Merge pair of phyloseq component data objects of the same class.
+#'
+#' Internal S4 methods to combine pairs of objects of classes specified in the
+#' phyloseq package. These objects must be component data of the same type
+#' (class). This is mainly an internal method, provided to illustrate how
+#' merging is performed by the more general \code{\link{merge_phyloseq}} function.
+#'
+#' The \code{\link{merge_phyloseq}} function is recommended in general.
+#'
+#' Special note: non-identical trees are merged using \code{\link[ape]{consensus}}.
+#'
+#' @usage merge_phyloseq_pair(x, y)
+#'
+#' @param x A phyloseq component data object. Methods are defined for
+#' \code{otu_table}, \code{taxonomyTable}, \code{sample_data},
+#' \code{phylo} (tree) and \code{XStringSet} (reference sequence) objects.
+#' Where \code{x} and \code{y} carry redundant information, \code{x} is
+#' generally given priority; see the return value description below.
+#'
+#' @param y A component data object of the same class as \code{x}.
+#'
+#' @return A single component data object that matches \code{x} and \code{y}
+#' arguments. The returned object will
+#' contain the union of the species and/or samples of each. If there is redundant
+#' information between a pair of arguments of the same class, the values in \code{x} are
+#' used by default. Abundance values are summed for \code{otu_table} objects
+#' for those elements that describe the same species and sample in \code{x}
+#' and \code{y}.
+#'
+#' @seealso \code{\link{merge_phyloseq}} \code{\link{merge_taxa}}
+#'
+#' @rdname merge_phyloseq_pair-methods
+#' @docType methods
+#' @export
+#'
+#' @examples #
+#' ## # merge two simulated otu_table objects.
+#' ## x <- otu_table(matrix(sample(0:5,200,TRUE),20,10), taxa_are_rows=TRUE)
+#' ## y <- otu_table(matrix(sample(0:5,300,TRUE),30,10), taxa_are_rows=FALSE)
+#' ## xy <- merge_phyloseq_pair(x, y)
+#' ## yx <- merge_phyloseq_pair(y, x)
+#' ## # merge two simulated tax_table objects
+#' ## x <- tax_table(matrix("abc", 20, 6))
+#' ## y <- tax_table(matrix("def", 30, 8))
+#' ## xy <- merge_phyloseq_pair(x, y)
+#' ## # merge two simulated sample_data objects
+#' ## x <- data.frame( matrix(sample(0:3,250,TRUE),25,10),
+#' ## matrix(sample(c("a","b","c"),150,TRUE),25,6) )
+#' ## x <- sample_data(x)
+#' ## y <- data.frame( matrix(sample(4:6,200,TRUE),20,10),
+#' ## matrix(sample(c("d","e","f"),120,TRUE),20,8) )
+#' ## y <- sample_data(y)
+#' ## merge_phyloseq_pair(x, y)
+#' ## data.frame(merge_phyloseq_pair(x, y))
+#' ## data.frame(merge_phyloseq_pair(y, x))
+setGeneric("merge_phyloseq_pair", function(x, y) standardGeneric("merge_phyloseq_pair"))
+################################################################################
+#' @aliases merge_phyloseq_pair,otu_table,otu_table-method
+#' @rdname merge_phyloseq_pair-methods
+setMethod("merge_phyloseq_pair", signature("otu_table", "otu_table"), function(x, y){
+  # Remember the orientation of x; the result is returned in that orientation.
+  x_taxa_rows <- taxa_are_rows(x)
+  all_taxa <- union(taxa_names(x), taxa_names(y))
+  all_samples <- union(sample_names(x), sample_names(y))
+
+  # Bring both inputs into taxa-by-sample orientation.
+  if( !x_taxa_rows ){ x <- t(x) }
+  if( !taxa_are_rows(y) ){ y <- t(y) }
+
+  # Zero-filled matrix spanning the union of taxa (rows) and samples (columns).
+  merged <- matrix(0, nrow=length(all_taxa), ncol=length(all_samples),
+                   dimnames=list(all_taxa, all_samples))
+
+  # Copy in the counts of x, then add the counts of y on top of them,
+  # so entries present in both objects end up summed.
+  merged[rownames(x), colnames(x)] <- x
+  merged[rownames(y), colnames(y)] <- merged[rownames(y), colnames(y)] + y
+
+  merged <- otu_table(merged, taxa_are_rows=TRUE)
+
+  # Hand the table back in the orientation x arrived in.
+  if( x_taxa_rows ){
+    merged
+  } else {
+    t(merged)
+  }
+})
+################################################################################
+#' @aliases merge_phyloseq_pair,taxonomyTable,taxonomyTable-method
+#' @rdname merge_phyloseq_pair-methods
+setMethod("merge_phyloseq_pair", signature("taxonomyTable", "taxonomyTable"), function(x, y){
+  taxa_union <- union(rownames(x), rownames(y))
+  rank_union <- union(colnames(x), colnames(y))
+
+  # NA-filled matrix spanning every taxon and every rank seen in x or y.
+  merged <- matrix(NA, nrow=length(taxa_union), ncol=length(rank_union),
+                   dimnames=list(taxa_union, rank_union))
+
+  # Fill from y first and from x second, so that the entries of x win
+  # wherever both tables describe the same taxon and rank.
+  merged[rownames(y), colnames(y)] <- y
+  merged[rownames(x), colnames(x)] <- x
+
+  # Build the taxonomy table from the merged matrix.
+  tax_table(merged)
+})
+################################################################################
+#' @aliases merge_phyloseq_pair,sample_data,sample_data-method
+#' @rdname merge_phyloseq_pair-methods
+setMethod("merge_phyloseq_pair", signature("sample_data", "sample_data"), function(x, y){
+  # Prepend the sample names as an explicit key column, "X0", and do a full
+  # outer merge. The result is a template holding every sample and variable.
+  partx <- data.frame("X0"=rownames(x), x)
+  party <- data.frame("X0"=rownames(y), y)
+  newx <- merge(partx, party, all=TRUE)
+  # A sample that is in both x and y but differs in a shared variable shows up
+  # on more than one row of the template. Keep the first row of each sample.
+  newx <- newx[!duplicated(as.character(newx[, 1])), , drop=FALSE]
+  rownames(newx) <- as.character(newx$"X0")
+
+  # "merge". Write y first, then overwrite with x information.
+  newx[rownames(y), colnames(y)] <- data.frame(y)
+  newx[rownames(x), colnames(x)] <- data.frame(x)
+
+  # Trim the sample name column. drop=FALSE keeps a data.frame even when a
+  # single variable is left; without it the table collapses to a bare vector.
+  newx <- newx[, names(newx)!="X0", drop=FALSE]
+
+  # Create the new sample_data object
+  newx <- sample_data(newx)
+  return(newx)
+})
+################################################################################
+#' @aliases merge_phyloseq_pair,phylo,phylo-method
+#' @rdname merge_phyloseq_pair-methods
+#' @importFrom ape consensus
+setMethod("merge_phyloseq_pair", signature("phylo", "phylo"), function(x, y){
+  # Only non-identical trees need merging; that is done with ape's consensus().
+  if( !identical(x, y) ){
+    return( consensus(x, y) )
+  }
+  # Identical trees: either one is the answer.
+  x
+})
+################################################################################
+#' @aliases merge_phyloseq_pair,XStringSet,XStringSet-method
+#' @rdname merge_phyloseq_pair-methods
+setMethod("merge_phyloseq_pair", signature("XStringSet", "XStringSet"), function(x, y){
+  # Differing classes only raise a warning; the merge is attempted regardless.
+  if( class(x) != class(y) ){
+    warning("For merging reference sequence objects, x and y should be same type.\n",
+            "That is, the same subclass of XStringSet. e.g. both DNAStringSet.\n",
+            "Try coercing each to the same compatible class prior to merge.")
+  }
+  # Names of the sequences that are in y but missing from x.
+  only_in_y = setdiff(taxa_names(y), taxa_names(x))
+  if( length(only_in_y) >= 1L ){
+    # Append just those sequences of y to x.
+    x = c(x, y[only_in_y])
+  }
+  # When y contributes nothing new, x is returned as it came in.
+  return(x)
+})
+################################################################################
+################################################################################
+#' Merge a subset of the species in \code{x} into one species/taxa/OTU.
+#'
+#' Takes as input an object that describes species/taxa
+#' (e.g. \code{\link{phyloseq-class}}, \code{\link{otu_table-class}},
+#' \code{\link{phylo-class}}, \code{\link{taxonomyTable-class}}),
+#' as well as
+#' a vector of species that should be merged.
+#' It is intended to be able to operate at a low-level such that
+#' related methods, such as \code{\link{tip_glom}} and \code{\link{tax_glom}}
+#' can both reliably call \code{merge_taxa} for their respective purposes.
+#'
+#' @usage merge_taxa(x, eqtaxa, archetype=1)
+#'
+#' @param x (Required). An object that describes species (taxa). This includes
+#' \code{\link{phyloseq-class}}, \code{\link{otu_table-class}}, \code{\link{taxonomyTable-class}},
+#' \code{\link[ape]{phylo}}.
+#'
+#' @param eqtaxa (Required). The species names, or indices, that should be merged together.
+#' If \code{length(eqtaxa) < 2}, then the object \code{x} will be returned
+#' safely unchanged.
+#'
+#' @param archetype (Optional). A single-length numeric or character.
+#' The index of \code{eqtaxa}, or OTU ID,
+#' indicating the species that should be kept to represent
+#' the summed/merged group of species/taxa/OTUs.
+#' The default is to use the OTU with the largest count total
+#' if counts are available, or to use \code{1}
+#' (the first OTU in \code{eqtaxa}) otherwise.
+#' \code{archetype} must identify one of the taxa in \code{eqtaxa},
+#' either as an index into \code{eqtaxa} or as an OTU ID;
+#' otherwise the merge stops with an error.
+#'
+#' @return The object, \code{x}, in its original class, but with the specified
+#' species merged into one entry in all relevant components.
+#'
+#' @seealso \code{\link{tip_glom}}, \code{\link{tax_glom}}, \code{\link{merge_phyloseq}},
+#' \code{\link{merge_samples}}
+#'
+#' @export
+#' @docType methods
+#' @rdname merge_taxa-methods
+#' @examples #
+#' data(esophagus)
+#' tree <- phy_tree(esophagus)
+#' otu <- otu_table(esophagus)
+#' otutree0 <- phyloseq(otu, tree)
+#' # plot_tree(otutree0)
+#' otutree1 <- merge_taxa(otutree0, 1:8, 2)
+#' # plot_tree(esophagus, ladderize="left")
+setGeneric("merge_taxa", function(x, eqtaxa, archetype=1L) standardGeneric("merge_taxa"))
+################################################################################
+#' @keywords internal
+merge_taxa.indices.internal = function(x, eqtaxa, archetype){
+  # Resolve `eqtaxa` and `archetype` (OTU names or integer indices) into the
+  # single taxa index that is kept and the taxa indices that are removed.
+  # Returns list(removeIndex=, keepIndex=); stops if any index is invalid.
+  ## If eqtaxa or archetype are character, interpret them to be OTUs and coerce them to integer indices
+  if( is.character(archetype) ){
+    # If archetype is already an OTU, just assign it to keepIndex
+    keepIndex = which(taxa_names(x) %in% archetype[1L])
+  } else {
+    # Else archetype is the numeric index of the eqtaxa that should be kept.
+    # Need to grab from unmodified eqtaxa, and then decide
+    archetype = eqtaxa[as.integer(archetype[1L])]
+    if( is.character(archetype) ){
+      # If archetype is now an OTU name, find the index and assign to keepIndex
+      keepIndex = which(taxa_names(x) == archetype[1L])
+    } else {
+      # Otherwise, assume it is a taxa index, and assign to keepIndex
+      keepIndex = as.integer(archetype)
+    }
+  }
+  # Ensure eqtaxa is the integer indices of the taxa that are being merged together
+  if( is.character(eqtaxa) ){
+    # assume OTU name, index it against the OTU names in x
+    eqtaxa = which(taxa_names(x) %in% eqtaxa)
+  } else {
+    # Else assume numeric index of the OTU that are being merged
+    eqtaxa = as.integer(eqtaxa)
+  }
+  # keepIndex is index of the OTU that is kept / everything merged into.
+  # It has to be exactly one non-missing index; anything else (no match, NA
+  # from an out-of-range archetype, several values) cannot be used in the
+  # scalar tests below.
+  if( length(keepIndex) != 1L || is.na(keepIndex) ){ stop("invalid archetype provided.") }
+  # It must be among the set of indices in eqtaxa or there is a logic error. Stop.
+  if( !keepIndex %in% eqtaxa ){ stop("invalid archetype provided. It is not part of eqtaxa.") }
+  # removeIndex is the index of each OTU that will be removed
+  removeIndex = setdiff(eqtaxa, keepIndex)
+  # Check that indices are valid: non-missing and within 1..ntaxa(x).
+  # Index 0 is rejected as well, since R indices start at 1.
+  allIndices = c(keepIndex, removeIndex)
+  if( anyNA(allIndices) || any(allIndices > ntaxa(x) | allIndices < 1L) ){
+    stop("invalid OTU indices provided as eqtaxa or archetype.")
+  }
+  return(list(removeIndex=removeIndex, keepIndex=keepIndex))
+}
+################################################################################
+#' @aliases merge_taxa,phyloseq-method
+#' @rdname merge_taxa-methods
+setMethod("merge_taxa", "phyloseq", function(x, eqtaxa,
+    archetype=eqtaxa[which.max(taxa_sums(x)[eqtaxa])]){
+
+  # Split x into its components and run merge_taxa on each of them with the
+  # same eqtaxa / archetype.
+  merged_list <- lapply(splat.phyloseq.objects(x), merge_taxa, eqtaxa, archetype)
+  # Rebuild the combined object from the species-merged components. The
+  # element names are stripped first because they can wreak havoc on do.call.
+  do.call("phyloseq", unname(merged_list))
+})
+###############################################################################
+# Don't need to merge anything for sample_data. Return As-is.
+#' @aliases merge_taxa,sample_data-method
+#' @rdname merge_taxa-methods
+setMethod("merge_taxa", "sample_data", function(x, eqtaxa, archetype=1L){
+  # Nothing to merge for sample_data; hand it back unchanged.
+  x
+})
+###############################################################################
+#' @aliases merge_taxa,otu_table-method
+#' @rdname merge_taxa-methods
+setMethod("merge_taxa", "otu_table", function(x, eqtaxa,
+    archetype=eqtaxa[which.max(taxa_sums(x)[eqtaxa])]){
+
+  # Fewer than two taxa requested: nothing to merge, return x as-is.
+  if( length(eqtaxa) < 2 ){
+    return(x)
+  }
+  indList = merge_taxa.indices.internal(x, eqtaxa, archetype)
+  removeIndex = indList$removeIndex
+  keepIndex = indList$keepIndex
+  # Merge taxa by summing all the equivalent taxa and assigning to the one in keepIndex.
+  # unique() makes sure a taxon listed more than once in eqtaxa is counted once.
+  if( taxa_are_rows(x) ){
+    x[keepIndex, ] = colSums(x[unique(eqtaxa), ])
+  } else {
+    x[, keepIndex] = rowSums(x[, unique(eqtaxa)])
+  }
+  # For speed, use matrix subsetting instead of prune_taxa().
+  # The retained positions are selected explicitly instead of negating
+  # removeIndex: a negated empty index, x[-integer(0), ], selects no taxa at all.
+  retainIndex = setdiff(seq_len(ntaxa(x)), removeIndex)
+  if (taxa_are_rows(x)) {
+    x = x[retainIndex, , drop = FALSE]
+  } else {
+    x = x[, retainIndex, drop = FALSE]
+  }
+  return(x)
+})
+###############################################################################
+#' @importFrom ape drop.tip
+#' @aliases merge_taxa,phylo-method
+#' @rdname merge_taxa-methods
+setMethod("merge_taxa", "phylo", function(x, eqtaxa, archetype=1L){
+  # With fewer than two taxa requested there is nothing to merge.
+  if( length(eqtaxa) < 2 ){
+    return(x)
+  }
+  removeIndex = merge_taxa.indices.internal(x, eqtaxa, archetype)$removeIndex
+  # Prune only while more than one tip would be left afterwards.
+  if( length(removeIndex) < (ntaxa(x)-1) ){
+    return( drop.tip(x, removeIndex) )
+  }
+  # Otherwise the tree would be left with 1 or 0 tips: warn and return NULL.
+  warning("merge_taxa attempted to reduce tree to 1 or fewer tips.\n tree replaced with NULL.")
+  return(NULL)
+})
+###############################################################################
+#' @importClassesFrom Biostrings XStringSet
+#' @aliases merge_taxa,XStringSet-method
+#' @rdname merge_taxa-methods
+setMethod("merge_taxa", "XStringSet", function(x, eqtaxa, archetype=1L){
+  # If there is nothing to merge, return x as-is
+  if( length(eqtaxa) < 2 ){
+    return(x)
+  }
+  indList = merge_taxa.indices.internal(x, eqtaxa, archetype)
+  removeIndex = indList$removeIndex
+  # If there is too much to merge (refseq would have 0 sequences), return NULL/warning
+  if( length(removeIndex) >= ntaxa(x) ){
+    # Can't have a refseq list with less
+    warning("merge_taxa attempted to reduce reference sequence list to 0 sequences.\n refseq replaced with NULL.")
+    return(NULL)
+  }
+  # Else, drop the removeIndex sequences and return the pruned XStringSet object.
+  # Skip the subsetting when removeIndex is empty (e.g. eqtaxa repeats a single
+  # taxon): x[-integer(0)] would drop every sequence instead of none.
+  if( length(removeIndex) > 0L ){
+    x <- x[-removeIndex]
+  }
+  return(x)
+})
+################################################################################
+#' @aliases merge_taxa,taxonomyTable-method
+#' @rdname merge_taxa-methods
+setMethod("merge_taxa", "taxonomyTable", function(x, eqtaxa, archetype=1L){
+  # Fewer than two taxa requested: nothing to merge, return x as-is.
+  if( length(eqtaxa) < 2 ){
+    return(x)
+  }
+  indList = merge_taxa.indices.internal(x, eqtaxa, archetype)
+  removeIndex = indList$removeIndex
+  keepIndex = indList$keepIndex
+  # # # Taxonomy is trivial in ranks after disagreement among merged taxa
+  # # # Make those values NA_character_
+  # drop=FALSE keeps a matrix even for a table with a single rank column,
+  # which apply() below requires.
+  taxmerge <- as(x, "matrix")[eqtaxa, , drop=FALSE]
+  bad_ranks <- apply(taxmerge, 2, function(i){ length(unique(i)) != 1 })
+  # Test if all taxonomies agree. If so, do nothing. Just continue to pruning.
+  if( any(bad_ranks) ){
+    # The col indices of the bad ranks: the first disagreeing rank and
+    # every rank after it.
+    bad_ranks <- min(which(bad_ranks)):length(bad_ranks)
+    # Replace bad taxonomy elements in the archetype only (others are pruned)
+    x[keepIndex, bad_ranks] <- NA_character_
+  }
+  # Finally, remove the OTUs that have been merged into keepIndex.
+  # The retained rows are selected explicitly instead of negating removeIndex:
+  # a negated empty index, x[-integer(0), ], selects no rows at all.
+  retainIndex <- setdiff(seq_len(ntaxa(x)), removeIndex)
+  return( x[retainIndex, , drop = FALSE] )
+})
+################################################################################
+################################################################################
+#' Merge samples based on a sample variable or factor.
+#'
+#' The purpose of this method is to merge/agglomerate the sample indices of a
+#' phyloseq object according to a categorical variable contained in a sample_data
+#' or a provided factor.
+#'
+#' NOTE: (\code{\link[ape]{phylo}}) trees and \code{\link{taxonomyTable-class}}
+#' are not modified by this function, but returned in the output object as-is.
+#'
+#' @usage merge_samples(x, group, fun=mean)
+#'
+#' @param x (Required). An instance of a phyloseq class that has sample indices. This includes
+#' \code{\link{sample_data-class}}, \code{\link{otu_table-class}}, and \code{\link{phyloseq-class}}.
+#'
+#' @param group (Required). Either a single character string matching a variable name in
+#' the corresponding sample_data of \code{x}, or a factor with the same length as
+#' the number of samples in \code{x}.
+#'
+#' @param fun (Optional). The function that will be used to merge the values that
+#' correspond to the same group for each variable. It must take a numeric vector
+#' as first argument and return a single value. Default is \code{\link[base]{mean}}.
+#' Note that this is (currently) ignored for the otu_table, where the equivalent
+#' function is \code{\link[base]{sum}}, but evaluated via \code{\link[base]{rowsum}}
+#' for efficiency.
+#'
+#' @return A phyloseq object that has had its sample indices merged according to
+#' the factor indicated by the \code{group} argument. The output class
+#' matches \code{x}.
+#'
+#' @seealso \code{\link{merge_taxa}}, \code{\link{merge_phyloseq}}
+#'
+#' @rdname merge_samples-methods
+#' @docType methods
+#' @export
+#'
+#' @examples #
+#' data(GlobalPatterns)
+#' GP = GlobalPatterns
+#' mergedGP = merge_samples(GlobalPatterns, "SampleType")
+#' SD = merge_samples(sample_data(GlobalPatterns), "SampleType")
+#' print(SD)
+#' print(mergedGP)
+#' sample_names(GlobalPatterns)
+#' sample_names(mergedGP)
+#' identical(SD, sample_data(mergedGP))
+#' # The OTU abundances of merged samples are summed
+#' # Let's investigate this ourselves looking at just the 10 most abundant OTUs...
+#' OTUnames10 = names(sort(taxa_sums(GP), TRUE)[1:10])
+#' GP10 = prune_taxa(OTUnames10, GP)
+#' mGP10 = prune_taxa(OTUnames10, mergedGP)
+#' ocean_samples = sample_names(subset(sample_data(GP), SampleType=="Ocean"))
+#' print(ocean_samples)
+#' otu_table(GP10)[, ocean_samples]
+#' rowSums(otu_table(GP10)[, ocean_samples])
+#' otu_table(mGP10)["Ocean", ]
+setGeneric("merge_samples", function(x, group, fun=mean) standardGeneric("merge_samples"))
+################################################################################
+#' @aliases merge_samples,sample_data-method
+#' @rdname merge_samples-methods
+setMethod("merge_samples", signature("sample_data"), function(x, group, fun=mean){
+  x1 <- data.frame(x)
+
+  # A single character string names a sample variable: replace it by that column.
+  # `&&` is the scalar operator that belongs in an if() condition.
+  if( is.character(group) && length(group)==1 ){
+    if( !group %in% colnames(x) ){stop("group not found among sample variable names.")}
+    group <- x1[, group]
+  }
+  # Attempt to coerce to factor. is.factor() is used instead of comparing
+  # class(): an ordered factor has a class vector of length two, which is
+  # not a valid if() condition.
+  if( !is.factor(group) ){
+    group <- factor(group)
+  }
+
+  # Remove any non-coercible columns.
+  # Philosophy is to keep as much as possible. If it is coercible at all, keep.
+  # Coerce all columns to numeric matrix. drop=FALSE keeps a data.frame when
+  # only one column is coercible, so sapply() still iterates over columns.
+  coercable <- sapply(x1, canCoerce, "numeric")
+  x2 <- sapply(x1[, coercable, drop=FALSE], as, "numeric")
+  rownames(x2) <- rownames(x1)
+
+  # Perform the aggregation.
+  outdf <- aggregate(x2, list(group), fun)
+  # get rownames from the levels of group
+  # rownames(outdf) <- as.character(outdf[, 1])
+  rownames(outdf) <- levels(group)
+  # "pop" the first column (the grouping column added by aggregate)
+  outdf <- outdf[, -1, drop=FALSE]
+
+  return( sample_data(outdf) )
+})
+################################################################################
+#' @aliases merge_samples,otu_table-method
+#' @rdname merge_samples-methods
+setMethod("merge_samples", signature("otu_table"), function(x, group){
+  # rowsum() aggregates over rows, so put the samples on the rows first.
+  if( taxa_are_rows(x) ){ x <- t(x) }
+  # Sum the rows of the plain matrix within each level of `group`.
+  summed <- rowsum(as(x, "matrix"), group)
+  # Wrap the result as an otu_table in sample-by-species orientation.
+  otu_table(summed, taxa_are_rows=FALSE)
+})
+################################################################################
+#' @aliases merge_samples,phyloseq-method
+#' @rdname merge_samples-methods
+setMethod("merge_samples", signature("phyloseq"), function(x, group, fun=mean){
+
+  # Check if phyloseq object has a sample_data
+  if( !is.null(sample_data(x, FALSE)) ){
+    # A single character string names a sample variable: replace it by that
+    # column. `&&` is the scalar operator that belongs in an if() condition.
+    if( is.character(group) && length(group)==1 ){
+      x1 <- data.frame(sample_data(x))
+      if( !group %in% colnames(x1) ){stop("group not found among sample variable names.")}
+      group <- x1[, group]
+    }
+    # Coerce to factor. is.factor() also handles ordered factors, whose
+    # class vector has length two and cannot be compared inside if().
+    if( !is.factor(group) ){ group <- factor(group) }
+    # Perform merges.
+    newSM <- merge_samples(sample_data(x), group, fun)
+    newOT <- merge_samples(otu_table(x), group)
+    phyloseqList <- list(newOT, newSM)
+  # Else, the only relevant object to "merge_samples" is the otu_table
+  } else {
+    if( !is.factor(group) ){ group <- factor(group) }
+    # Kept unnamed, like every other element of the build-call-list.
+    phyloseqList <- list( merge_samples(otu_table(x), group) )
+  }
+
+  ### Add to build-call-list the remaining components, if present in x.
+  ### NULL is returned by accessor if object lacks requested component/slot.
+  ### Order of objects in list doesn't matter for phyloseq.
+  ### The list should not be named.
+  if( !is.null(access(x, "tax_table")) ){ phyloseqList <- c(phyloseqList, list(tax_table(x))) }
+  if( !is.null(access(x, "phy_tree")) ){ phyloseqList <- c(phyloseqList, list(phy_tree(x))) }
+
+  return( do.call("phyloseq", phyloseqList) )
+})
+################################################################################
diff --git a/R/multtest-wrapper.R b/R/multtest-wrapper.R
new file mode 100644
index 0000000..1c6af8f
--- /dev/null
+++ b/R/multtest-wrapper.R
@@ -0,0 +1,173 @@
+####################################################################################
+# # # # Avoiding full import of multtest to mitigate potential conflicts
+####################################################################################
+#' Multiple testing of taxa abundance according to sample categories/classes
+#'
+#' Please note that it is up to you to perform any necessary
+#' normalizing / standardizing transformations prior to these tests.
+#' See for instance \code{\link{transform_sample_counts}}.
+#'
+#' @param physeq (Required). \code{\link{otu_table-class}} or \code{\link{phyloseq-class}}.
+#' In this multiple testing framework, different taxa correspond to variables
+#' (hypotheses), and samples to observations.
+#'
+#' @param classlabel (Required). A single character index of the sample-variable
+#' in the \code{\link{sample_data}} of \code{physeq} that will be used for multiple testing.
+#' Alternatively, \code{classlabel} can be a custom integer (or numeric coercible
+#' to an integer), character, or factor with
+#' length equal to \code{nsamples(physeq)}.
+#'
+#' NOTE: the default test applied to each taxa is a two-sample two-sided
+#' \code{\link{t.test}}, WHICH WILL FAIL with an error if you provide a data variable
+#' (or custom vector) that contains MORE THAN TWO classes. One alternative to consider
+#' is an F-test, by specifying \code{test="f"} as an additional argument. See
+#' the first example below, and/or further documentation of
+#' \code{\link[multtest]{mt.maxT}} or \code{\link[multtest]{mt.minP}}
+#' for other options and formal details.
+#'
+#' @param minPmaxT (Optional). Character string. \code{"minP"} or \code{"maxT"},
+#' selecting \code{\link[multtest]{mt.minP}} or \code{\link[multtest]{mt.maxT}}, respectively. Default is \code{"minP"}.
+#'
+#' @param method (Optional). Additional multiple-hypothesis correction methods.
+#' A character vector from the set \code{\link[stats]{p.adjust.methods}}.
+#' Default is \code{"fdr"}, for the Benjamini and Hochberg (1995) method
+#' to control False Discovery Rate (FDR). This argument is passed on to
+#' \code{\link[stats]{p.adjust}}, please see that documentation for more details.
+#'
+#' @param ... (Optional). Additional arguments, forwarded to
+#' \code{\link[multtest]{mt.maxT}} or \code{\link[multtest]{mt.minP}}
+#'
+#' @return A dataframe with components specified in the documentation for
+#' \code{\link[multtest]{mt.maxT}} or \code{\link[multtest]{mt.minP}}, respectively.
+#'
+#' @seealso
+#'
+#' \code{\link[multtest]{mt.maxT}}
+#'
+#' \code{\link[multtest]{mt.minP}}
+#'
+#' \code{\link[stats]{p.adjust}}
+#'
+#' @rdname mt-methods
+#' @docType methods
+#' @export
+#'
+#' @importFrom multtest mt.maxT
+#' @importFrom multtest mt.minP
+#' @importFrom stats p.adjust
+#' @importFrom stats p.adjust.methods
+#'
+#' @examples
+#' ## # Simple example, testing genera that sig correlate with Enterotypes
+#' data(enterotype)
+#' # Filter samples that don't have Enterotype
+#' x <- subset_samples(enterotype, !is.na(Enterotype))
+#' # (the taxa are at the genera level in this dataset)
+#' res = mt(x, "Enterotype", method=c("fdr", "bonferroni"), test="f", B=300)
+#' head(res, 10)
+#' ## # Not surprisingly, Prevotella and Bacteroides top the list.
+#' ## # Different test, multiple-adjusted t-test, whether samples are ent-2 or not.
+#' ## mt(x, get_variable(x, "Enterotype")==2)
+setGeneric(name="mt", def=function(physeq, classlabel, minPmaxT="minP", method="fdr", ...){ standardGeneric("mt") })
+################################################################################
+# First, access the otu_table, and if appropriate, define classlabel from
+# the sample_data.
+#' @aliases mt,phyloseq,ANY-method
+#' @rdname mt-methods
+setMethod("mt", c("phyloseq", "ANY"), function(physeq, classlabel, minPmaxT="minP", method="fdr", ...){
+  # Extract the class information from the sample_data
+  # if sample_data slot is non-empty,
+  # and the classlabel is a character-class
+  # and its length is 1.
+  if( !is.null(sample_data(physeq, FALSE)) &&
+      inherits(classlabel, "character") &&
+      identical(length(classlabel), 1L) ){
+    # Define a raw factor based on the data available in a sample variable
+    rawFactor = get_variable(physeq, classlabel[1])
+    if( !inherits(rawFactor, "factor") ){
+      # coerce to a factor if it is not already one.
+      rawFactor = factor(rawFactor)
+    }
+    # Either way, replace `classlabel` with `rawFactor`
+    classlabel = rawFactor
+  }
+  # Either way, dispatch `mt` on otu_table(physeq)
+  MT = mt(otu_table(physeq), classlabel, minPmaxT, ...)
+  if( !is.null(tax_table(physeq, FALSE)) ){
+    # If there is tax_table data present,
+    # add/cbind it to the results.
+    MT = cbind(MT, as(tax_table(physeq), "matrix")[rownames(MT), , drop=FALSE])
+  }
+  # Use only the supported methods. `method` may name several, so filter before the scalar test.
+  method <- method[method %in% p.adjust.methods]
+  if( length(method) > 0 ){
+    # One adjusted-p column per method, named by method; matrix() keeps columns even if MT has one row.
+    adjp = vapply(method, function(meth, p){p.adjust(p, meth)}, numeric(nrow(MT)), p = MT$rawp)
+    MT <- cbind(MT, matrix(adjp, nrow=nrow(MT), ncol=length(method), dimnames=list(NULL, method)))
+  }
+  return(MT)
+})
+################################################################################
+# All valid mt() calls eventually funnel dispatch to this method.
+# The otu_table orientation is checked/handled here (and only here).
+#' @aliases mt,otu_table,integer-method
+#' @rdname mt-methods
+setMethod("mt", signature("otu_table", "integer"), function(physeq, classlabel, minPmaxT="minP", ...){
+  # Put taxa on rows (transposing if needed), then hand a plain matrix to the internal wrapper.
+  taxaByRow <- if( taxa_are_rows(physeq) ){ physeq } else { t(physeq) }
+  return( mt.phyloseq.internal(as(taxaByRow, "matrix"), classlabel, minPmaxT, ...) )
+})
+################################################################################
+# Coerce numeric classlabel to integer and re-dispatch, passing the caller's `minPmaxT` through.
+#' @aliases mt,otu_table,numeric-method
+#' @rdname mt-methods
+setMethod("mt", c("otu_table", "numeric"), function(physeq, classlabel, minPmaxT="minP", ...){
+  mt(physeq, as(classlabel, "integer"), minPmaxT=minPmaxT, ...)
+})
+################################################################################
+# Coerce logical classlabel to integer and re-dispatch, passing the caller's `minPmaxT` through.
+#' @aliases mt,otu_table,logical-method
+#' @rdname mt-methods
+setMethod("mt", c("otu_table", "logical"), function(physeq, classlabel, minPmaxT="minP", ...){
+  mt(physeq, as(classlabel, "integer"), minPmaxT=minPmaxT, ...)
+})
+################################################################################
+# Test for length, then dispatch...
+#' @aliases mt,otu_table,character-method
+#' @rdname mt-methods
+setMethod("mt", signature("otu_table", "character"), function(physeq, classlabel, minPmaxT="minP", ...){
+  # Guard clause: a character classlabel must supply exactly one label per sample.
+  labelCountOK <- length(classlabel) == nsamples(physeq)
+  if( !labelCountOK ){
+    stop("classlabel not the same length as nsamples(physeq)")
+  }
+  # Re-dispatch with the labels converted to a factor.
+  return( mt(physeq, factor(classlabel), minPmaxT, ...) )
+})
+################################################################################
+# Coerce factor to an integer vector of group labels,
+# starting at 0 for the first group
+#' @aliases mt,otu_table,factor-method
+#' @rdname mt-methods
+setMethod("mt", c("otu_table", "factor"), function(physeq, classlabel, minPmaxT="minP", ...){
+  # Integer group codes starting at 0. Unused levels are dropped first so that k groups map to
+  # 0..k-1, and so a level code larger than length(classlabel) can no longer index out to NA.
+  classlabel <- as.integer(droplevels(classlabel)) - 1L
+  mt(physeq, classlabel, minPmaxT, ...)
+})
+####################################################################################
+# Internal function
+# @aliases mt,matrix,integer-method
+# not exported
+#' @keywords internal
+mt.phyloseq.internal <- function(physeq, classlabel, minPmaxT="minP", ...){
+  # Run mt.minP or mt.maxT on `physeq`; the full names "mt.minP"/"mt.maxT" are accepted as synonyms.
+  if( minPmaxT %in% c("minP", "mt.minP") ){
+    return( mt.minP(physeq, classlabel, ...) )
+  } else if( minPmaxT %in% c("maxT", "mt.maxT") ){
+    return( mt.maxT(physeq, classlabel, ...) )
+  } else {
+    stop("Nothing calculated. minPmaxT argument must be either minP or maxT.")
+  }
+}
+####################################################################################
diff --git a/R/network-methods.R b/R/network-methods.R
new file mode 100644
index 0000000..34f129b
--- /dev/null
+++ b/R/network-methods.R
@@ -0,0 +1,171 @@
+################################################################################
+#' Make microbiome network (igraph)
+#'
+#' A specialized function for creating a network representation of microbiomes,
+#' sample-wise or taxa-wise,
+#' based on a user-defined ecological distance and (potentially arbitrary) threshold.
+#' The graph is ultimately represented using the
+#' \code{igraph}-package.
+#'
+#' @usage make_network(physeq, type="samples", distance="jaccard", max.dist = 0.4,
+#' keep.isolates=FALSE, ...)
+#'
+#' @param physeq (Required).
+#' A \code{\link{phyloseq-class}} object,
+#' or \code{\link{otu_table-class}} object,
+#' from which the network is built. \code{phyloseq-class} recommended.
+#'
+#' @param type (Optional). Default \code{"samples"}.
+#' Whether the network should be samples or taxa/OTUs.
+#' Supported arguments are \code{"samples"}, \code{"taxa"},
+#' where \code{"taxa"} indicates using the OTUs/taxaindices,
+#' whether they actually represent species or some other taxonomic rank.
+#'
+#' NOTE: not all distance methods are supported if \code{"taxa"}
+#' selected for type. For example, the UniFrac distance and DPCoA
+#' cannot be calculated for taxa-wise distances, because they use
+#' a taxa-wise tree as part of their calculation between samples, and
+#' there is no transpose-equivalent for this tree.
+#'
+#' @param distance (Optional). Default \code{"jaccard"}.
+#' Any supported argument to the \code{method} parameter of the
+#' \code{\link{distance}} function is supported here.
+#' Some distance methods, like \code{"unifrac"}, may take
+#' a non-trivial amount of time to calculate, in which case
+#' you probably want to calculate the distance matrix separately,
+#' save, and then provide it as the argument to \code{distance} instead
+#' (see below for alternatives).
+#'
+#' Alternatively, if you have already calculated the sample-wise distance
+#' object, the resulting \code{dist}-class object
+#' can be provided as \code{distance} instead (see examples).
+#'
+#' A third alternative is to provide a function that takes
+#' a sample-by-taxa matrix (typical vegan orientation)
+#' and returns a sample-wise distance
+#' matrix.
+#'
+#' @param max.dist (Optional). Default \code{0.4}.
+#' The maximum ecological distance (as defined by \code{distance})
+#' allowed between two samples to still consider them ``connected''
+#' by an edge in the graphical model.
+#'
+#' @param keep.isolates (Optional). Default \code{FALSE}. Logical.
+#' Whether to keep isolates (un-connected samples, not microbial isolates)
+#' in the graphical model that is returned. Default results in isolates
+#' being removed from the object.
+#'
+#' @param ... (Optional). Additional parameters passed on to \code{\link{distance}}.
+#'
+#' @return A \code{igraph}-class object.
+#'
+#' @seealso
+#' \code{\link{plot_network}}
+#'
+#' @importFrom igraph graph.adjacency
+#' @importFrom igraph V
+#' @importFrom igraph delete.vertices
+#' @importFrom igraph degree
+#' @importFrom igraph vcount
+#'
+#' @export
+#'
+#' @examples
+#' # # Example plots with Enterotype Dataset
+#' data(enterotype)
+#' ig <- make_network(enterotype, max.dist=0.3)
+#' plot_network(ig, enterotype, color="SeqTech", shape="Enterotype", line_weight=0.3, label=NULL)
+#' #
+#' ig1 <- make_network(enterotype, max.dist=0.2)
+#' plot_network(ig1, enterotype, color="SeqTech", shape="Enterotype", line_weight=0.3, label=NULL)
+#' #
+#' # # Three methods of choosing/providing distance/distance-method
+#' # Provide method name available to distance() function
+#' ig <- make_network(enterotype, max.dist=0.3, distance="jaccard")
+#' # Provide distance object, already computed
+#' jaccdist <- distance(enterotype, "jaccard")
+#' ih <- make_network(enterotype, max.dist=0.3, distance=jaccdist)
+#' # Provide "custom" function.
+#' ii <- make_network(enterotype, max.dist=0.3, distance=function(x){vegan::vegdist(x, "jaccard")})
+#' # They have equal results:
+#' all.equal(ig, ih)
+#' all.equal(ig, ii)
+#' #
+#' # Try out making a trivial "network" of the 3-sample esophagus data,
+#' # with weighted-UniFrac as distance
+#' data(esophagus)
+#' ij <- make_network(esophagus, "samples", "unifrac", weighted=TRUE)
+make_network <- function(physeq, type="samples", distance="jaccard", max.dist = 0.4,
+                         keep.isolates=FALSE, ...){
+  # Build a thresholded adjacency matrix (CoMa) from pairwise distances, then convert it to igraph.
+  if( type %in% c("taxa", "species", "OTUs", "otus", "otu")){
+    # Calculate or assign taxa-wise distance matrix
+    if( inherits(distance, "dist") ){
+      # If distance a distance object, use it rather than re-calculate
+      obj.dist <- distance
+      if( attributes(obj.dist)$Size != ntaxa(physeq) ){
+        stop("ntaxa(physeq) does not match size of dist object in distance")
+      }
+      if( !setequal(attributes(obj.dist)$Labels, taxa_names(physeq)) ){
+        stop("taxa_names does not exactly match dist-indices")
+      }
+    } else if( is.character(distance) ){
+      # If character string, pass on to distance(), assume supported
+      obj.dist <- distance(physeq, method=distance, type=type, ...)
+    } else {
+      # Else, assume a custom function and attempt to calculate.
+      # Enforce orientation for taxa-wise distances
+      if( !taxa_are_rows(physeq) ){ physeq <- t(physeq) }
+      # Calculate distances
+      obj.dist <- distance(as(otu_table(physeq), "matrix"))
+    }
+    # coerce distance-matrix back into vanilla matrix, Taxa Distance Matrix, TaDiMa
+    TaDiMa <- as.matrix(obj.dist)
+    # Add Inf to the diagonal to avoid self-connecting edges (inefficient)
+    TaDiMa <- TaDiMa + diag(Inf, ntaxa(physeq), ntaxa(physeq))
+    # Convert distance matrix to coincidence matrix, CoMa, using max.dist
+    CoMa <- TaDiMa < max.dist
+  } else if( type == "samples" ){
+    # Calculate or assign sample-wise distance matrix
+    if( inherits(distance, "dist") ){ # If argument is already a distance matrix.
+      # If distance a distance object, use it rather than re-calculate
+      obj.dist <- distance
+      if( attributes(obj.dist)$Size != nsamples(physeq) ){
+        stop("nsamples(physeq) does not match size of dist object in distance")
+      }
+      if( !setequal(attributes(obj.dist)$Labels, sample_names(physeq)) ){
+        stop("sample_names does not exactly match dist-indices")
+      }
+    } else if( is.character(distance) ){
+      # If character string, pass on to distance(), assume supported
+      obj.dist <- distance(physeq, method=distance, type=type, ...)
+    } else {
+      # Else, assume a custom function and attempt to calculate.
+      # Enforce orientation for sample-wise distances
+      if(taxa_are_rows(physeq)){ physeq <- t(physeq) }
+      # Calculate distances
+      obj.dist <- distance(as(otu_table(physeq), "matrix"))
+    }
+    # coerce distance-matrix back into vanilla matrix, Sample Distance Matrix, SaDiMa
+    SaDiMa <- as.matrix(obj.dist)
+    # Add Inf to the diagonal to avoid self-connecting edges (inefficient)
+    SaDiMa <- SaDiMa + diag(Inf, nsamples(physeq), nsamples(physeq))
+    # Convert distance matrix to coincidence matrix, CoMa, using max.dist
+    CoMa <- SaDiMa < max.dist
+  } else {
+    stop("type argument must be one of \n (1) samples \n or \n (2) taxa")
+  }
+  # Calculate the igraph-formatted network
+  ig <- graph.adjacency(CoMa, mode="lower")
+  if( !keep.isolates ){
+    # If not-keeping isolates, remove them (isolates are the vertices with degree 0)
+    isolates <- V(ig)[degree(ig) == 0]
+    ig <- delete.vertices(ig, isolates)
+  }
+  if( vcount(ig) < 2 ){
+    # Report a warning if the graph is empty
+    warning("The graph you created has too few vertices. Consider changing `max.dist` argument, and check your data.")
+  }
+  return(ig)
+}
+################################################################################
diff --git a/R/ordination-methods.R b/R/ordination-methods.R
new file mode 100644
index 0000000..b70a0cf
--- /dev/null
+++ b/R/ordination-methods.R
@@ -0,0 +1,642 @@
+################################################################################
+#' Perform an ordination on phyloseq data
+#'
+#' This function wraps several commonly-used ordination methods. The type of
+#' ordination depends upon the argument to \code{method}. Try
+#' \code{ordinate("help")} or \code{ordinate("list")} for the currently
+#' supported method options.
+#'
+#' @param physeq (Required). Phylogenetic sequencing data
+#' (\code{\link{phyloseq-class}}). The data on which you want to perform
+#' the ordination. In general, these methods will be based in some fashion on
+#' the abundance table ultimately stored as a contingency matrix
+#' (\code{\link{otu_table-class}}). If you're able to import data into
+#' \code{\link{phyloseq-class}} format, then you don't need to worry, as an
+#' \code{otu_table} is a required component of this class. In addition, some
+#' ordination methods require additional data, like a constraining variable
+#' or phylogenetic tree. If that is the case, the relevant data should be
+#' included in \code{physeq} prior to running. Integrating the data in this way
+#' also results in these different data components being checked for validity
+#' and completeness by the method.
+#'
+#' @param method (Optional). A character string. Default is \code{"DCA"}.
+#'
+#' Currently supported method options are:
+#' \code{c("DCA", "CCA", "RDA", "CAP", "DPCoA", "NMDS", "MDS", "PCoA")}
+#'
+#' \describe{
+#' \item{DCA}{Performs detrended correspondence analysis using \code{\link{decorana}}}
+#' \item{CCA}{Performs correspondence analysis,
+#' or optionally, constrained correspondence analysis
+#' (a.k.a. canonical correspondence analysis),
+#' via \code{\link[vegan]{cca}}}
+#' \item{RDA}{Performs redundancy analysis, or optionally
+#' principal components analysis, via \code{\link[vegan]{rda}}}
+#' \item{CAP}{[Partial] Constrained Analysis of Principal Coordinates
+#' or distance-based RDA, via \code{\link[vegan]{capscale}}.
+#' See \code{\link[phyloseq]{capscale.phyloseq}} for more details.
+#' In particular, a \code{\link{formula}} argument must be provided.}
+#' \item{DPCoA}{Performs Double Principal Coordinate Analysis using a
+#' (corrected, if necessary) phylogenetic/patristic distance
+#' between species. The calculation is performed by
+#' \code{\link{DPCoA}}(), which ultimately uses
+#' \code{\link[ade4]{dpcoa}} after making the appropriate
+#' accessions/corrections of the data.}
+#' \item{NMDS}{Performs Non-metric MultiDimensional Scaling of a sample-wise
+#' ecological distance matrix onto a user-specified number of axes, \code{k}.
+#' By default, \code{k=2}, but this can be modified as a supplementary argument.
+#' This method is ultimately carried out by \code{\link{metaMDS}} after the
+#' appropriate accessions and distance calculations.
+#' Because \code{metaMDS} includes its own distance
+#' calculation wrappers to \code{\link[vegan]{vegdist}}, and these provide
+#' additional functionality in the form of species scores,
+#' \code{ordinate} will pass-on the \code{distance}
+#' argument to \code{metaMDS} if it is among the
+#' supported \code{vegdist} methods. However, all distance methods
+#' supported by \code{\link{distance}} are supported here,
+#' including \code{"unifrac"} and \code{"DPCoA"}.}
+#' \item{MDS/PCoA}{Performs principal coordinate analysis
+#' (also called principal coordinate decomposition,
+#' multidimensional scaling (MDS), or classical scaling)
+#' of a distance matrix (Gower 1966),
+#' including two correction methods for negative eigenvalues.
+#' See
+#' \code{\link[ape]{pcoa}} for further details.
+#' }
+#' }
+#'
+#' @param distance (Optional). A character string. Default is \code{"bray"}.
+#' The name of a supported \code{\link{distance}} method;
+#' or, alternatively,
+#' a pre-computed \code{\link{dist}}-class object.
+#' This argument is only utilized
+#' if a distance matrix is required by the ordination method specified by the
+#' \code{method} argument (above).
+#'
+#' Any supported \code{\link{distance}} methods
+#' are supported arguments to \code{distance} here.
+#' See \code{\link{distance}} for more details, examples.
+#'
+#' @param formula (Optional). A model \code{\link{formula}}.
+#' Only relevant for certain ordination methods.
+#' The left hand side is ignored, defined by
+#' the \code{physeq} and \code{distance} arguments.
+#' The right hand side gives the constraining variables,
+#' and conditioning variables can be given
+#' within a special function \code{Condition}.
+#' See \code{\link[vegan]{cca}} or \code{\link[vegan]{capscale}}
+#' for examples/details.
+#'
+#' @param ... (Optional). Additional arguments to supporting functions. For
+#' example, the additional argument \code{weighted=TRUE} would be passed on
+#' to \code{\link{UniFrac}} if \code{"unifrac"} were chosen as the
+#' \code{distance} option and \code{"MDS"} as the ordination \code{method}
+#' option. Alternatively, if \code{"DCA"} were chosen as the
+#' ordination \code{method} option, additional arguments would be passed on
+#' to the relevant ordination function, \code{\link{decorana}}, for example.
+#'
+#' @return
+#' An ordination object. The specific class of the returned object depends upon the
+#' ordination method, as well as the function/package that is called internally
+#' to perform it.
+#' As a general rule, any of the ordination classes
+#' returned by this function will be recognized by downstream tools in the
+#' \code{phyloseq} package, for example the ordination plotting
+#' function, \code{\link{plot_ordination}}.
+#'
+#' @seealso
+#' \href{http://joey711.github.io/phyloseq/plot_ordination-examples}{The plot_ordination Tutorial}
+#'
+#' Related component ordination functions described within phyloseq:
+#'
+#' \code{\link{DPCoA}}
+#'
+#' Described/provided by other packages:
+#'
+#' \code{\link{cca}}/\code{\link{rda}}, \code{\link{decorana}}, \code{\link{metaMDS}},
+#' \code{\link{pcoa}}, \code{\link[vegan]{capscale}}
+#'
+#' NMDS and MDS/PCoA both operate on distance matrices, typically based on some
+#' pairwise comparison of the microbiomes in an experiment/project. There are
+#' a number of common methods to use to calculate these pairwise distances, and
+#' the most convenient function (from a \code{phyloseq} point of view) for calculating
+#' these distance matrices is the
+#'
+#' \code{\link{distance}}
+#'
+#' function. It can be
+#' thought of as a distance / dissimilarity-index companion function for
+#' \code{ordinate}, and indeed the distance options provided to \code{ordinate}
+#' are often simply passed on to \code{\link{distance}}.
+#'
+#' A good quick summary of ordination is provided in the introductory vignette
+#' for vegan:
+#'
+#' \href{http://cran.r-project.org/web/packages/vegan/vignettes/intro-vegan.pdf}{vegan introductory vignette}
+#'
+#' The following \code{R} task views are also useful for understanding the
+#' available tools in \code{R}:
+#'
+#' \href{http://cran.r-project.org/web/views/Environmetrics.html}{Analysis of Ecological and Environmental Data}
+#'
+#' \href{http://cran.r-project.org/web/views/Multivariate.html}{Multivariate Statistics}
+#'
+#' @importFrom vegan decorana
+#' @importFrom vegan metaMDS
+#' @importFrom vegan wisconsin
+#' @importFrom vegan decostand
+#' @importFrom ape pcoa
+#' @export
+#' @examples
+#' # See http://joey711.github.io/phyloseq/plot_ordination-examples
+#' # for many more examples.
+#' # plot_ordination(GP, ordinate(GP, "DCA"), "samples", color="SampleType")
+ordinate = function(physeq, method="DCA", distance="bray", formula=NULL, ...){
+  # If `physeq` is a formula, post deprecated notice, attempt to convert and dispatch
+  if( inherits(physeq, "formula") ){
+    .Deprecated(msg=paste0("First argument, `physeq`, as formula is deprecated.\n",
+                           "There is now an explicit `formula` argument.\n",
+                           "Please revise method call accordingly."))
+    # Deparse the formula; a two-sided formula gives c("~", LHS, RHS).
+    formchar = as.character(physeq)
+    # Error if only RHS. Formula-first syntax required both sides.
+    if(length(formchar) < 3){
+      stop("Need both sides of formula in this deprecated syntax... Revisit ordinate() documentation / examples.")
+    }
+    # Replace with (presumed) phyloseq object, looked up by the name given on the LHS.
+    physeq <- get(formchar[2])
+    # Create the new formula, RHS-only.
+    newFormula = as.formula(paste0("~", formchar[length(formchar)]))
+    # Dispatch to (hopefully) ordinate,phyloseq
+    return(ordinate(physeq, method=method, distance=distance, formula=newFormula, ...))
+  }
+  # Define table of currently-supported methods
+  method_table <- c("DCA", "CCA", "RDA", "CAP", "DPCoA", "NMDS", "MDS", "PCoA")
+  # List supported method names to user, if requested.
+  if( inherits(physeq, "character") ){
+    if( physeq=="help" ){
+      cat("Available arguments to methods:\n")
+      print(c(method_table))
+      cat("Please be exact, partial-matching not supported.\n")
+      cat("Can alternatively provide a custom distance.\n")
+      cat("See:\n help(\"distance\") \n")
+      return()
+    } else if( physeq=="list" ){
+      return(c(method_table))
+    } else {
+      cat("physeq needs to be a phyloseq-class object, \n")
+      cat("or a character string matching \"help\" or \"list\". \n")
+    }
+  }
+  # Final check that `physeq` is a phyloseq or otu_table class
+  if( !inherits(physeq, "phyloseq") && !inherits(physeq, "otu_table") ){
+    stop("Expected a phyloseq object or otu_table object.")
+  }
+  # Fail early on an unsupported `method` instead of falling through every branch and returning NULL.
+  if( !isTRUE(method %in% method_table) ){ stop("Unsupported `method`; try ordinate(\"list\") for the supported options.") }
+  # DCA. This and the next three methods are dispatched directly, without computing `ps.dist` here.
+  if( method == "DCA" ){
+    return( decorana(veganifyOTU(physeq), ...) )
+  }
+  # CCA / RDA
+  if( method %in% c("CCA", "RDA") ){
+    return(cca.phyloseq(physeq, formula, method, ...))
+  }
+  # CAP
+  if( method == "CAP" ){
+    # `distance` is passed through to capscale.phyloseq unchanged.
+    return(capscale.phyloseq(physeq, formula, distance, ...))
+  }
+  # DPCoA
+  if( method == "DPCoA" ){
+    return( DPCoA(physeq, ...) )
+  }
+  # Remaining methods need a distance: define ps.dist from a dist object or a distance-method name.
+  if( inherits(distance, "dist") ){
+    ps.dist <- distance
+  } else if( is.character(distance) ){
+    # NMDS/metaMDS gets the method name directly when vegdist supports it; otherwise compute ps.dist.
+    vegdist_methods <- c("manhattan", "euclidean", "canberra", "bray",
+                         "kulczynski", "jaccard", "gower", "altGower", "morisita", "horn",
+                         "mountford", "raup" , "binomial", "chao")
+    # NMDS with vegdist-method to include species
+    if(method == "NMDS" && distance %in% vegdist_methods){
+      return(metaMDS(veganifyOTU(physeq), distance, ...))
+    }
+    # Calculate distance with handoff to distance()
+    ps.dist <- distance(physeq, distance, ...)
+  } else {
+    stop("`distance` must be a character string naming a distance method, or a dist-class object.")
+  }
+  # Vanilla MDS/PCoA
+  if( method %in% c("PCoA", "MDS")){
+    return(pcoa(ps.dist))
+  }
+  # NMDS with non-vegdist-method
+  if(method == "NMDS"){
+    return(metaMDS(ps.dist))
+  }
+}
+################################################################################
+#' Calculate Double Principal Coordinate Analysis (DPCoA)
+#' using phylogenetic distance
+#'
+#' Function uses abundance (\code{\link{otu_table-class}}) and
+#' phylogenetic (\code{\link[ape]{phylo}}) components of a
+#' \code{\link{phyloseq-class}} experiment-level object
+#' to perform a
+#' Double Principal Coordinate Analysis (DPCoA), relying heavily on
+#' the underlying (and more general) function, \code{\link[ade4]{dpcoa}}.
+#' The distance object ultimately provided is the square root of the
+#' cophenetic/patristic (\code{\link[ape]{cophenetic.phylo}}) distance
+#' between the species, which is always Euclidean.
+#'
+#' Although this distance is Euclidean, for numerical reasons it
+#' will sometimes look non-Euclidean, and a correction will be performed.
+#' See \code{correction} argument.
+#'
+#' @param physeq (Required). A \code{\link{phyloseq-class}} object
+#' containing, at a minimum, abundance (\code{\link{otu_table-class}}) and
+#' phylogenetic (\code{\link[ape]{phylo}}) components.
+#' As a test, the accessors \code{\link{otu_table}} and \code{\link{phy_tree}}
+#' should return an object without error.
+#'
+#' @param correction (Optional). A function. The function must be
+#' able to take a non-Euclidean \code{\link{dist}}ance object,
+#' and return a new \code{dist}ance object that is Euclidean.
+#' If testing a distance object, try \code{\link[ade4]{is.euclid}}.
+#'
+#'
+#' Although the distance matrix should always be Euclidean, for numerical
+#' reasons it will sometimes appear non-Euclidean and a correction method must
+#' be applied. Two recommended correction methods are
+#' \code{\link[ade4]{cailliez}} and \code{\link[ade4]{lingoes}}.
+#' The default is \code{cailliez},
+#' but not for any particularly special reason. If the
+#' distance matrix is Euclidean, no correction will be
+#' performed, regardless of the value of the \code{correction} argument.
+#'
+#' @param scannf (Optional). Logical. Default is \code{FALSE}. This
+#' is passed directly to \code{\link[ade4]{dpcoa}}, and causes a
+#' barplot of eigenvalues to be created if \code{TRUE}. This is not
+#' included in \code{...} because the default for \code{\link[ade4]{dpcoa}}
+#' is \code{TRUE}, although in many expected situations we would want
+#' to suppress creating the barplot.
+#'
+#' @param ... Additional arguments passed to \code{\link[ade4]{dpcoa}}.
+#'
+#' @return A \code{dpcoa}-class object (see \code{\link[ade4]{dpcoa}}).
+#'
+#' @seealso \code{\link[ade4]{dpcoa}}
+#'
+#' @author Julia Fukuyama \email{julia.fukuyama@@gmail.com}.
+#' Adapted for phyloseq by Paul J. McMurdie.
+#'
+#' @importFrom ape cophenetic.phylo
+#' @importFrom ade4 cailliez
+#' @importFrom ade4 dpcoa
+#' @importFrom ade4 is.euclid
+#' @export
+#' @references
+#' Pavoine, S., Dufour, A.B. and Chessel, D. (2004)
+#' From dissimilarities among species to dissimilarities among communities:
+#' a double principal coordinate analysis.
+#' Journal of Theoretical Biology, 228, 523-537.
+#'
+#' @examples
+#' # # # # # # Esophagus
+#' data(esophagus)
+#' eso.dpcoa <- DPCoA(esophagus)
+#' eso.dpcoa
+#' plot_ordination(esophagus, eso.dpcoa, "samples")
+#' plot_ordination(esophagus, eso.dpcoa, "species")
+#' plot_ordination(esophagus, eso.dpcoa, "biplot")
+#' #
+#' #
+#' # # # # # # GlobalPatterns
+#' data(GlobalPatterns)
+#' # subset GP to top-150 taxa (to save computation time in example)
+#' keepTaxa <- names(sort(taxa_sums(GlobalPatterns), TRUE)[1:150])
+#' GP <- prune_taxa(keepTaxa, GlobalPatterns)
+#' # Perform DPCoA
+#' GP.dpcoa <- DPCoA(GP)
+#' plot_ordination(GP, GP.dpcoa, color="SampleType")
+DPCoA <- function(physeq, correction=cailliez, scannf=FALSE, ...){
+  # Check that physeq is a phyloseq-class; inherits() gives a scalar answer for any class vector.
+  if( !inherits(physeq, "phyloseq") ){stop("physeq must be phyloseq-class")}
+
+  # Remove any OTUs that are absent from all the samples.
+  physeq <- prune_taxa((taxa_sums(physeq) > 0), physeq)
+
+  # Access components for handing-off
+  OTU <- otu_table(physeq)
+  tree <- phy_tree(physeq)
+
+  # Enforce that OTU is in samples-by-species orientation
+  if(taxa_are_rows(OTU) ){ OTU <- t(OTU) }
+
+  # get the patristic distances between the species from the tree (square root is taken)
+  patristicDist <- sqrt(as.dist(cophenetic.phylo(tree)))
+
+  # if the patristic distances are not Euclidean,
+  # then correct them or throw meaningful error.
+  if( !is.euclid(patristicDist) ){
+    patristicDist <- correction(patristicDist)
+
+    # Check that this is now Euclidean.
+    if( !is.euclid(patristicDist) ){
+      stop('Corrected distance still not Euclidean \n',
+           "please provide a different correction method")
+    }
+  }
+
+  # NOTE: the dpcoa function in ade4 requires a data.frame
+  return( dpcoa(data.frame(OTU), patristicDist, scannf, ...) )
+}
+################################################################################
+################################################################################
+# vegan::cca "extension".
+# formula is main input to this function. This complicates signature handling.
+# A new method with a separate name is defined instead.
+#
+# Must transpose the phyloseq otu_table to fit the vegan::cca convention
+# Whether-or-not to transpose needs to be a check, based on the
+# "taxa_are_rows" slot value
+################################################################################
+#' Constrained Correspondence Analysis and Redundancy Analysis.
+#'
+#' This is the internal function that simplifies getting phyloseq data
+#' into the constrained ordination functions,
+#' \code{\link[vegan]{cca}} and \code{\link[vegan]{rda}}.
+#' Unlike \code{\link[phyloseq]{capscale.phyloseq}}, the formula argument
+#' to these methods is optional, and results in an unconstrained ordination.
+#'
+#' @param physeq (Required). Phylogenetic sequencing data
+#' (\code{\link{phyloseq-class}}).
+#' The data on which you want to perform the ordination.
+#'
+#' @param formula (Optional). A \code{\link{formula}},
+#' specifying the constraining variable(s) format,
+#' with variable names corresponding to \code{\link{sample_data}} (RHS)
+#' from within \code{physeq}.
+#'
+#' @param method (Optional). A single \code{\link{character}} string,
+#' specifying \code{"RDA"} or \code{"CCA"}. Default is \code{"CCA"}.
+#'
+#' @param ... (Optional). Additional named arguments intended for
+#' \code{\link[vegan]{cca}} or \code{\link[vegan]{rda}}.
+#'
+#' @return same output as \code{\link[vegan]{cca}}
+#' or \code{\link[vegan]{rda}}, respectively.
+#'
+#' @seealso \code{\link{plot_ordination}},
+#' \code{\link[vegan]{rda}}, \code{\link[vegan]{cca}}
+#'
+#' @aliases cca.phyloseq rda.phyloseq
+#' @rdname cca-rda-phyloseq-methods
+#' @docType methods
+#'
+#' @keywords internal
+#' @examples #
+#' # cca.phyloseq(physeq, formula, method, ...)
+setGeneric(
+  "cca.phyloseq",
+  def=function(physeq, formula=NULL, method="CCA", ...){
+    # Hand over to whichever method matches the argument classes.
+    standardGeneric("cca.phyloseq")
+  }
+)
+#' @importFrom vegan cca
+#' @importFrom vegan rda
+#' @aliases cca.phyloseq,phyloseq,formula-method
+#' @rdname cca-rda-phyloseq-methods
+setMethod("cca.phyloseq", signature=c("phyloseq", "formula"),
+function(physeq, formula, method="CCA", ...){
+  # Constrained CCA/RDA of the OTU table against sample_data variables.
+  #   physeq  - phyloseq object; must carry non-empty sample_data.
+  #   formula - only its last deparsed element (the right-hand side) is used.
+  #   method  - "CCA" or "RDA"; anything else warns and returns NULL.
+  #   ...     - forwarded to cca() / rda(). Previously these arguments were
+  #             accepted but silently dropped.
+  data = data.frame(sample_data(physeq, FALSE), stringsAsFactors=FALSE)
+  if( length(data) < 1 ){
+    stop("`physeq` argument must include non-empty `sample_data`")
+  }
+  # `OTU` must remain a local variable of this frame: the formula text built
+  # below refers to it by name.
+  OTU = veganifyOTU(physeq)
+  # Create new formula. Left-hand side is ignored.
+  formchar = as.character(formula)
+  newFormula = as.formula(paste0("OTU ~ ", formchar[length(formchar)]))
+  # Note that ade4 also has a conflicting "cca" function.
+  # You don't import ade4::cca to avoid the conflict.
+  if(method=="CCA"){
+    return(cca(newFormula, data=data, ...))
+  } else if(method=="RDA"){
+    return(rda(newFormula, data=data, ...))
+  } else {
+    warning("Unsupported `method` argument. Must be 'RDA' or 'CCA'")
+    return(NULL)
+  }
+})
+#' @importFrom vegan cca
+#' @aliases cca.phyloseq,otu_table-method
+#' @rdname cca-rda-phyloseq-methods
+setMethod("cca.phyloseq", signature="otu_table",
+          function(physeq, formula=NULL, method="CCA", ...){
+  # Unconstrained CCA/RDA directly on an OTU table.
+  #   physeq  - otu_table object; re-oriented for vegan by veganifyOTU().
+  #   formula - ignored: an OTU table by itself indicates that an
+  #             unconstrained ordination is requested.
+  #   method  - "CCA" or "RDA"; anything else warns and returns NULL.
+  #   ...     - forwarded to cca() / rda(). Previously these arguments were
+  #             accepted but silently dropped.
+  if(method=="CCA"){
+    return(cca(veganifyOTU(physeq), ...))
+  } else if(method=="RDA"){
+    return(rda(veganifyOTU(physeq), ...))
+  } else {
+    warning("Unsupported `method` argument. Must be 'RDA' or 'CCA'")
+    return(NULL)
+  }
+})
+#' @importFrom vegan cca
+#' @aliases cca.phyloseq,phyloseq,NULL-method
+#' @rdname cca-rda-phyloseq-methods
+setMethod("cca.phyloseq", signature=c("phyloseq", "NULL"),
+function(physeq, formula, method="CCA", ...){
+  # A NULL formula means an unconstrained ordination was requested:
+  # pull out the OTU table and re-dispatch on it, passing NULL as the formula.
+  otu <- otu_table(physeq)
+  cca.phyloseq(otu, NULL, method, ...)
+})
+################################################################################
+#' Estimate the gap statistic on an ordination result
+#'
+#' This is a wrapper for the \code{\link[cluster]{clusGap}} function,
+#' expecting an ordination result as the main data argument.
+#'
+#' @param ord (Required). An ordination object. The precise class can vary.
+#' Any ordination classes supported internally by the phyloseq package
+#' should work, ultimately by passing to the \code{\link[vegan]{scores}} function
+#' or its internal extensions in phyloseq.
+#' @param axes (Optional). The ordination axes that you want to include.
+#' @param type (Optional). One of \code{"sites"}
+#' (the vegan package label for samples) or
+#' \code{"species"} (the vegan package label for OTUs/taxa).
+#' Default is \code{"sites"}.
+#' @param FUNcluster (Optional). This is passed to \code{\link[cluster]{clusGap}}.
+#' The documentation is copied here for convenience:
+#' a function which accepts as first argument a (data) matrix like \code{x},
+#' second argument, say (the number of desired clusters) \code{k}, where \code{k >= 2},
+#' and returns a list with a component named (or shortened to) cluster
+#' which is a vector of length \code{n = nrow(x)} of integers in \code{1:k}
+#' determining the clustering or grouping of the \code{n} observations.
+#' The default value is the following function, which wraps
+#' partitioning around medoids, \code{\link[cluster]{pam}}:
+#'
+#' \code{function(x, k){list(cluster = pam(x, k, cluster.only=TRUE))}}
+#'
+#' Any function that has these input/output properties (performing a clustering)
+#' will suffice. The more appropriate the clustering method, the better chance
+#' your gap statistic results will be useful.
+#' @param K.max (Optional). A single positive integer value.
+#' It indicates the maximum number of clusters that will be considered.
+#' Value must be at least two.
+#' This is passed to \code{\link[cluster]{clusGap}}.
+#' @param ... (Optional). Additional named parameters
+#' passed on to \code{\link[cluster]{clusGap}}.
+#' For example, the \code{method} argument provides for extensive options
+#' regarding the method by which the ``optimal'' number of clusters
+#' is computed from the gap statistics (and their standard deviations).
+#' See the \code{\link[cluster]{clusGap}} documentation for more details.
+#'
+#' @return
+#' An object of S3 class \code{"clusGap"}, basically a list with components.
+#' See the \code{\link[cluster]{clusGap}} documentation for more details.
+#'
+#' @importFrom vegan scores
+#' @importFrom cluster clusGap
+#' @importFrom cluster pam
+#' @export
+#' @examples
+#' data("soilrep")
+#' sord = ordinate(soilrep, "PCoA", "bray")
+#' # Evaluate axes with scree plot
+#' plot_scree(sord)
+#' # Gap Statistic
+#' gs = gapstat_ord(sord, axes=1:3, verbose=FALSE)
+#' # plot_ordination(soilrep, sord, color="Treatment")
+#' plot_clusgap(gs)
+#' print(gs, method="Tibs2001SEmax")
+gapstat_ord = function(ord, axes=c(1:2), type="sites",
+    FUNcluster=function(x, k){list(cluster = pam(x, k, cluster.only=TRUE))},
+    K.max=8, ...){
+  # Gap statistic on ordination coordinates.
+  #   ord        - ordination object understood by scores().
+  #   axes       - columns of the score matrix to cluster on; NULL means all.
+  #   type       - passed to scores() as `display`.
+  #   FUNcluster, K.max, ... - handed on to clusGap().
+  # Returns the value of clusGap().
+  #
+  # Use the scores function to get the ordination coordinates
+  x = scores(ord, display=type)
+  # If axes not explicitly defined (NULL), then use all of them
+  if(is.null(axes)){
+    axes = seq_len(ncol(x))
+  }
+  # Finally, perform, and return, the gap statistic calculation using
+  # cluster::clusGap. drop=FALSE keeps the selection two-dimensional even
+  # when a single axis is requested; without it the subset collapses to a
+  # plain vector, which is not the matrix-like input clusGap documents.
+  return(clusGap(x[, axes, drop=FALSE], FUNcluster, K.max, ...))
+}
+################################################################################
+# Define an internal function for accessing and orienting the OTU table
+# in a fashion suitable for vegan functions
+# @keywords internal
+veganifyOTU <- function(physeq){
+  # Return the OTU table of `physeq` as a plain matrix, transposing first
+  # whenever taxa_are_rows() reports TRUE.
+  oriented <- physeq
+  if( taxa_are_rows(physeq) ){
+    oriented <- t(physeq)
+  }
+  as(otu_table(oriented), "matrix")
+}
+################################################################################
+#' Constrained Analysis of Principal Coordinates, \code{\link[vegan]{capscale}}.
+#'
+#' See \code{\link[vegan]{capscale}} for details. A formula is main input.
+#'
+#' @param physeq (Required). Phylogenetic sequencing data
+#' (\code{\link{phyloseq-class}}).
+#' The data on which you want to perform the ordination.
+#'
+#' @param formula (Required). A \code{\link{formula}}, specifying the input.
+#' No need to directly access components. \code{capscale.phyloseq} understands
+#' where to find the abundance table (LHS) and \code{\link{sample_data}} (RHS)
+#' from within the phyloseq object.
+#'
+#' @param distance (Required). A \code{\link{character}} string, specifying
+#' the name of the dissimilarity (or distance) method supported by
+#' the phyloseq \code{\link[phyloseq]{distance}} function.
+#' Alternatively, a pre-computed \code{\link{dist}}-object can be provided here,
+#' in which case it supersedes any use of the \code{\link{otu_table}}
+#' in your phyloseq object.
+#'
+#' Note that \code{\link[vegan]{capscale}}
+#' with Euclidean distances will be identical to \code{\link[vegan]{rda}}
+#' in eigenvalues and in site, species, and biplot scores
+#' (except for possible sign reversal). However, it makes no sense to use
+#' \code{\link[vegan]{capscale}} with Euclidean distances,
+#' since direct use of \code{\link[vegan]{rda}} is much more efficient
+#' (and supported in the \code{\link{ordinate}} function with \code{method=="RDA"})
+#' Even with non-Euclidean dissimilarities,
+#' the rest of the analysis will be metric and linear.
+#'
+#' @param ... (Optional). Additional named arguments passed to
+#' \code{\link[vegan]{capscale}}.
+#'
+#' @return Ordination object defined by \code{\link[vegan]{capscale}}.
+#'
+#' @seealso
+#' \code{\link{plot_ordination}}
+#'
+#' \code{\link[vegan]{rda}}
+#'
+#' \code{\link[vegan]{capscale}}
+#'
+#' @aliases capscale.phyloseq
+#' @rdname capscale-phyloseq-methods
+#' @docType methods
+#' @importFrom vegan capscale
+#' @keywords internal
+#' @examples
+#' # See other examples at
+#' # http://joey711.github.io/phyloseq/plot_ordination-examples
+#' data(GlobalPatterns)
+#' GP = prune_taxa(names(sort(taxa_sums(GlobalPatterns), TRUE)[1:50]), GlobalPatterns)
+#' ordcap = ordinate(GP, "CAP", "bray", ~SampleType)
+#' plot_ordination(GP, ordcap, "samples", color="SampleType")
+setGeneric("capscale.phyloseq", function(physeq, formula, distance, ...){
+  # Guard shared by every method: fail before dispatch when `physeq`
+  # yields a data.frame with no columns from its sample_data.
+  sdf <- data.frame(sample_data(physeq, FALSE), stringsAsFactors=FALSE)
+  if( length(sdf) == 0 ){
+    stop("`physeq` argument must include non-empty `sample_data`")
+  }
+  standardGeneric("capscale.phyloseq")
+})
+#' @importFrom vegan capscale
+#' @aliases capscale.phyloseq,phyloseq,formula,dist-method
+#' @rdname capscale-phyloseq-methods
+setMethod("capscale.phyloseq", c("phyloseq", "formula", "dist"),
+function(physeq, formula, distance, ...){
+  # capscale() on a pre-computed dist object.
+  # The names `data`, `newFormula` and `distance` are kept as-is on purpose:
+  # the formula text refers to `distance` by name, and the other two appear
+  # verbatim in the capscale() call.
+  data <- data.frame(sample_data(physeq), stringsAsFactors=FALSE)
+  # Deparse the user formula and keep only its last element (the RHS);
+  # the `distance` argument becomes the new left-hand side.
+  rhs <- as.character(formula)
+  rhs <- rhs[length(rhs)]
+  newFormula <- as.formula(paste0("distance ~ ", rhs))
+  capscale(formula=newFormula, data=data, ...)
+})
+#' @importFrom vegan capscale
+#' @aliases capscale.phyloseq,phyloseq,formula,character-method
+#' @rdname capscale-phyloseq-methods
+setMethod("capscale.phyloseq", c("phyloseq", "formula", "character"),
+function(physeq, formula, distance, ...){
+  # capscale() where `distance` is given as the name of a distance method.
+  data = data.frame(sample_data(physeq), stringsAsFactors=FALSE)
+  # Exactly one method name is expected; anything else only warns,
+  # and the first element is used regardless.
+  if( length(distance) != 1 ){
+    warning("`distance` was unexpected length. \n",
+            " `distance` argument should be a single character string",
+            " or dist matrix. \n",
+            "Attempting to use first element only.")
+  }
+  distance <- distance[1]
+  # The name must appear somewhere in distanceMethodList
+  # (which is a superset of vegdist).
+  supported <- unlist(distanceMethodList)
+  if( !(distance %in% supported) ){
+    stop("The distance method you specified is not supported by phyloseq")
+  }
+  if( !(distance %in% distanceMethodList$vegdist) ){
+    # Not a vegdist method: compute the sample-wise distances here with
+    # phyloseq's distance(), then re-dispatch on the result.
+    distance <- distance(physeq=physeq, method=distance, type="samples")
+    return(capscale.phyloseq(physeq, formula, distance, ...))
+  }
+  # A vegdist method: hand the OTU matrix and the method name to
+  # vegan::capscale. `OTU` must keep this name, the formula text uses it.
+  OTU = veganifyOTU(physeq)
+  formchar = as.character(formula)
+  newFormula = as.formula(paste0("OTU ~ ", formchar[length(formchar)]))
+  return(capscale(formula=newFormula, data=data, distance=distance, ...))
+})
+################################################################################
diff --git a/R/otuTable-class.R b/R/otuTable-class.R
new file mode 100644
index 0000000..f202de9
--- /dev/null
+++ b/R/otuTable-class.R
@@ -0,0 +1,145 @@
+################################################################################
+#' Build or access the otu_table.
+#'
+#' This is the suggested method for both constructing and accessing
+#' Operational Taxonomic Unit (OTU) abundance (\code{\link{otu_table-class}}) objects.
+#' When the first
+#' argument is a matrix, otu_table() will attempt to create and return an
+#' otu_table-class object,
+#' which further depends on whether or not \code{taxa_are_rows} is provided as an
+#' additional argument.
+#' Alternatively, if the first argument is an experiment-level (\code{\link{phyloseq-class}})
+#' object, then the corresponding \code{otu_table} is returned.
+#'
+#' @usage otu_table(object, taxa_are_rows, errorIfNULL=TRUE)
+#'
+#' @param object (Required). An integer matrix, \code{\link{otu_table-class}},
+#' or \code{\link{phyloseq-class}}.
+#'
+#' @param taxa_are_rows (Conditionally optional). Logical; of length 1. Ignored
+#' unless \code{object} is a matrix, in which case it is required.
+#'
+#' @param errorIfNULL (Optional). Logical. Should the accessor stop with
+#' an error if the slot is empty (\code{NULL})? Default \code{TRUE}. Ignored
+#' if \code{object} argument is a matrix (constructor invoked instead).
+#'
+#' @return An \code{\link{otu_table-class}} object.
+#'
+#' @seealso \code{\link{phy_tree}}, \code{\link{sample_data}}, \code{\link{tax_table}}
+#' \code{\link{phyloseq}}, \code{\link{merge_phyloseq}}
+#'
+#' @docType methods
+#' @rdname otu_table-methods
+#' @export
+#' @examples #
+#' # data(GlobalPatterns)
+#' # otu_table(GlobalPatterns)
+setGeneric(
+  "otu_table",
+  def=function(object, taxa_are_rows, errorIfNULL=TRUE){
+    # Constructor or accessor, depending on the class of `object`.
+    standardGeneric("otu_table")
+  }
+)
+# Access the otu_table slot.
+#' @aliases otu_table,phyloseq-method
+#' @rdname otu_table-methods
+setMethod("otu_table", "phyloseq", function(object, errorIfNULL=TRUE){
+  # Accessor: fetch the "otu_table" slot through the universal accessor.
+  access(physeq=object, slot="otu_table", errorIfNULL=errorIfNULL)
+})
+# return the otu_table as-is.
+#' @aliases otu_table,otu_table-method
+#' @rdname otu_table-methods
+setMethod("otu_table", "otu_table", function(object, errorIfNULL=TRUE){
+  # Already an otu_table: hand it back untouched.
+  object
+})
+# Instantiate an otu_table from a raw abundance matrix.
+#' @aliases otu_table,matrix-method
+#' @rdname otu_table-methods
+setMethod("otu_table", "matrix", function(object, taxa_are_rows){
+  # Constructor: build the otu_table first, so that new() can reject the
+  # input before any names are touched.
+  otutab <- new("otu_table", object, taxa_are_rows=taxa_are_rows)
+  # Dummy index names are only filled in where they are missing.
+  # "sp" prefixes the taxa margin and "sa" the sample margin.
+  if( taxa_are_rows ){
+    row_prefix <- "sp"
+    col_prefix <- "sa"
+  } else {
+    row_prefix <- "sa"
+    col_prefix <- "sp"
+  }
+  if( is.null(rownames(otutab)) ){
+    rownames(otutab) <- paste0(row_prefix, 1:nrow(otutab))
+  }
+  if( is.null(colnames(otutab)) ){
+    colnames(otutab) <- paste0(col_prefix, 1:ncol(otutab))
+  }
+  return(otutab)
+})
+# # # Convert to matrix, then dispatch.
+#' @aliases otu_table,data.frame-method
+#' @rdname otu_table-methods
+setMethod("otu_table", "data.frame", function(object, taxa_are_rows){
+  # Coerce the data.frame to a matrix, then let the generic dispatch again.
+  abund <- as(object, "matrix")
+  otu_table(abund, taxa_are_rows)
+})
+# Any less-specific class, not inherited by those above.
+#' @aliases otu_table,ANY-method
+#' @rdname otu_table-methods
+setMethod("otu_table", "ANY", function(object, errorIfNULL=TRUE){
+  # Fallback for every other class: defer to the universal accessor, which
+  # decides whether an "otu_table" slot can be extracted from `object`.
+  access(physeq=object, slot="otu_table", errorIfNULL=errorIfNULL)
+})
+################################################################################
+#' Returns the total number of individuals observed from each species/taxa/OTU.
+#'
+#' A convenience function equivalent to rowSums or colSums, but where
+#' the orientation of the otu_table is automatically handled.
+#'
+#' @usage taxa_sums(x)
+#'
+#' @param x \code{\link{otu_table-class}}, or \code{\link{phyloseq-class}}.
+#'
+#' @return A \code{\link{numeric-class}} with length equal to the number of species
+#' in the table, names indicating the taxa IDs, and values equal to the sum of
+#' all individuals observed for each taxa in \code{x}.
+#'
+#' @seealso \code{\link{sample_sums}}, \code{\link{rowSums}}, \code{\link{colSums}}
+#' @export
+#' @examples
+#' data(enterotype)
+#' taxa_sums(enterotype)
+#' data(esophagus)
+#' taxa_sums(esophagus)
+taxa_sums <- function(x){
+  # Sum abundances per taxon: along rows when taxa are rows,
+  # along columns otherwise.
+  otu <- otu_table(x)
+  if( !taxa_are_rows(otu) ){
+    return(colSums(otu))
+  }
+  rowSums(otu)
+}
+################################################################################
+#' Returns the total number of individuals observed from each sample.
+#'
+#' A convenience function equivalent to rowSums or colSums, but where
+#' the orientation of the otu_table is automatically handled.
+#'
+#' @usage sample_sums(x)
+#'
+#' @param x \code{\link{otu_table-class}}, or \code{\link{phyloseq-class}}.
+#'
+#' @return A named \code{\link{numeric-class}}
+#' length equal to the number of samples
+#' in the \code{x}, name indicating the sample ID, and value equal to the sum of
+#' all individuals observed for each sample in \code{x}.
+#'
+#' @seealso \code{\link{taxa_sums}}, \code{\link{rowSums}}, \code{\link{colSums}}
+#' @export
+#' @examples
+#' data(enterotype)
+#' sample_sums(enterotype)
+#' data(esophagus)
+#' sample_sums(esophagus)
+sample_sums <- function(x){
+  # Sum abundances per sample: along columns when taxa are rows,
+  # along rows otherwise.
+  otu <- otu_table(x)
+  if( !taxa_are_rows(otu) ){
+    return(rowSums(otu))
+  }
+  colSums(otu)
+}
+################################################################################
diff --git a/R/phylo-class.R b/R/phylo-class.R
new file mode 100644
index 0000000..ef8e7af
--- /dev/null
+++ b/R/phylo-class.R
@@ -0,0 +1,26 @@
+# Methods related to using phylo in phyloseq, including
+# phyloseq-internal calls to ape internals.
+################################################################################
+#' Method for fixing problems with phylo-class trees in phyloseq
+#'
+#' For now this only entails replacing each missing (\code{NA}) branch-length
+#' value with 0.0.
+#'
+#' @keywords internal
+setGeneric("fix_phylo", function(tree){
+  standardGeneric("fix_phylo")
+})
+#' @rdname fix_phylo
+#' @aliases fix_phylo,phylo-method
+setMethod("fix_phylo", "phylo", function(tree){
+  # Overwrite every NA branch length with zero and return the tree.
+  missing_len <- is.na(tree$edge.length)
+  tree$edge.length[missing_len] <- 0
+  tree
+})
+################################################################################
+# Define horizontal position / node-ages by depth to root
+# For instance, `xx` in `plot_tree` and `tipAges` in `fastUniFrac`
+#' @keywords internal
+ape_node_depth_edge_length <- function(Ntip, Nnode, edge, Nedge, edge.length){
+  # Thin wrapper around ape's compiled `node_depth_edgelength` routine.
+  # The two columns of `edge` are passed as separate integer vectors, and a
+  # zero-filled double vector of length Ntip + Nnode is passed last.
+  # .C() returns its arguments as a list; the 7th element (that last
+  # vector, presumably filled in by the C routine) is the result.
+  edge_col1 <- as.integer(edge[, 1])
+  edge_col2 <- as.integer(edge[, 2])
+  buffer <- double(Ntip + Nnode)
+  res <- .C(ape:::node_depth_edgelength, PACKAGE="ape", as.integer(Ntip),
+            as.integer(Nnode), edge_col1, edge_col2, as.integer(Nedge),
+            as.double(edge.length), buffer)
+  res[[7]]
+}
\ No newline at end of file
diff --git a/R/phyloseq-class.R b/R/phyloseq-class.R
new file mode 100644
index 0000000..ad125d2
--- /dev/null
+++ b/R/phyloseq-class.R
@@ -0,0 +1,400 @@
+################################################################################
+#' Build phyloseq-class objects from their components.
+#'
+#' \code{phyloseq()} is a constructor method. This is the main method
+#' suggested for constructing an experiment-level (\code{\link{phyloseq-class}})
+#' object from its component data
+#' (component data classes: \code{\link{otu_table-class}}, \code{\link{sample_data-class}},
+#' \code{\link{taxonomyTable-class}}, \code{\link{phylo-class}}).
+#'
+#' @usage phyloseq(...)
+#'
+#' @param ... One or more component objects among the set of classes
+#' defined by the phyloseq package, as well as \code{phylo}-class
+#' (defined by the \code{\link{ape-package}}). Each argument should be a different class.
+#' For combining multiple components of the same class, or multiple phyloseq-class
+#' objects, use the \code{\link{merge_phyloseq}} function. Unlike in earlier
+#' versions, the arguments to phyloseq do not need to be named, and the order
+#' of the arguments does not matter.
+#'
+#' @return The class of the returned object depends on the argument
+#' class(es). For an experiment-level object, two or more component data objects
+#' must be provided.
+#' Otherwise, if a single component-class
+#' is provided, it is simply returned as-is.
+#' The order of arguments does not matter.
+#'
+#' @seealso \code{\link{merge_phyloseq}}
+#' @export
+#' @examples
+#' data(esophagus)
+#' x1 = phyloseq(otu_table(esophagus), phy_tree(esophagus))
+#' identical(x1, esophagus)
+#' # # data(GlobalPatterns)
+#' # # GP <- GlobalPatterns
+#' # # phyloseq(sample_data(GP), otu_table(GP))
+#' # # phyloseq(otu_table(GP), phy_tree(GP))
+#' # # phyloseq(tax_table(GP), otu_table(GP))
+#' # # phyloseq(phy_tree(GP), otu_table(GP), sample_data(GP))
+#' # # phyloseq(otu_table(GP), tax_table(GP), sample_data(GP))
+#' # # phyloseq(otu_table(GP), phy_tree(GP), tax_table(GP), sample_data(GP))
+phyloseq <- function(...){
+  # Build a phyloseq-class object from component objects given in any order.
+  # Arguments that are not component-class objects are ignored. A single
+  # component is returned as-is; two or more are combined, pruned to their
+  # shared taxa/samples, re-ordered, and have NA branch lengths zeroed.
+
+  arglist <- list(...)
+
+  # Remove names from arglist. Will replace them based on their class
+  names(arglist) <- NULL
+
+  # ignore all but component data classes.
+  # vapply() always returns a logical vector, even for an empty argument
+  # list, where sapply() would return list() and break the subsetting.
+  arglist <- arglist[vapply(arglist, is.component.class, logical(1))]
+  if( length(arglist) < 1 ){
+    stop("No component data objects provided.\n",
+         "phyloseq() needs at least one component-class argument.\n")
+  }
+
+  # Make the name-replaced, splatted list
+  splatlist <- sapply(arglist, splat.phyloseq.objects)
+
+  # rm any forbidden chars in index names (e.g. quotes - phylogenetic tree).
+  # Right now, only extra quotes are forbidden.
+  splatlist = lapply(splatlist, function(x){
+    taxa_names(x) <- gsub("\"", "", taxa_names(x), fixed=TRUE)
+    taxa_names(x) <- gsub("\'", "", taxa_names(x), fixed=TRUE)
+    return(x)
+  })
+
+  ####################
+  ## Need to determine whether to
+  # (A) instantiate a new raw/uncleaned phyloseq object, or
+  # (B) return a single component, or
+  # (C) to stop with an error because of incorrect argument types.
+  if( length(splatlist) > length(get.component.classes()) ){
+    stop("Too many components provided\n")
+  } else if( length(names(splatlist)) > length(unique(names(splatlist))) ){
+    stop("Only one of each component type allowed.\n",
+         "For merging multiple objects of the same type/class, try merge_phyloseq(...)\n")
+  } else if( length(splatlist) == 1){
+    # Note: the original argument is returned, not the quote-stripped copy.
+    return(arglist[[1]])
+  } else {
+    # Instantiate the phyloseq-class object, ps.
+    ps <- do.call("new", c(list(Class="phyloseq"), splatlist) )
+  }
+
+  ####################
+  ## Reconcile the taxa and sample index names between components
+  ## in the newly-minted phyloseq object
+  shared_taxa = intersect_taxa(ps)
+  shared_samples = intersect_samples(ps)
+
+  if( length(shared_taxa) < 1 ){
+    stop("Problem with OTU/taxa indices among those you provided.\n",
+         "Check using intersect() and taxa_names()\n"
+    )
+  }
+  if( length(shared_samples) < 1 ){
+    # The sample accessor is sample_names(); the message used to point at
+    # taxa_names(), copied from the taxa check above.
+    stop("Problem with sample indices among those you provided.\n",
+         "Check using intersect() and sample_names()\n"
+    )
+  }
+
+  # Start with OTU indices
+  ps = prune_taxa(shared_taxa, ps)
+
+  # Then keep only the samples shared by the sample-describing components.
+  ps = prune_samples(shared_samples, ps)
+
+  # Force both samples and taxa indices to be in the same order.
+  ps = index_reorder(ps, "both")
+
+  # Replace any NA branch-length values in the tree with zero.
+  # (`@` slot access restored; the mail archive had rewritten it as " at ".)
+  if( !is.null(phy_tree(ps, FALSE)) ){
+    ps@phy_tree <- fix_phylo(ps@phy_tree)
+  }
+
+  return(ps)
+}
+################################################################################
+# A relatively fast way to access from phyloseq object components
+# f - function name as character string
+# physeq - a phyloseq object (phyloseq-class instance)
+#' @keywords internal
+f_comp_ps = function(f, physeq){
+  # Apply `f` to the content of every slot of the phyloseq class definition.
+  #   f      - function name as a character string (a function object is
+  #            accepted as well).
+  #   physeq - a phyloseq object (phyloseq-class instance).
+  # Returns the sapply() result, named by slot name.
+  #
+  # The function is looked up once and the slots are read with slot(),
+  # instead of pasting together and evaluating "f(ps@slot)" source text.
+  fun <- if( is.function(f) ){ f } else { get(f, mode="function") }
+  sapply(names(getSlots("phyloseq")), function(i, ps){
+    fun(slot(ps, i))
+  }, physeq)
+}
+# f_comp_ps("taxa_names", ps)
+# f_comp_ps("ntaxa", ps)
+# Reduce("union", f_comp_ps("taxa_names", ps))
+# Reduce("intersect", f_comp_ps("taxa_names", ps))
+################################################################################
+#' Show the component objects classes and slot names.
+#'
+#' There are no arguments to this function. It returns a named character
+#' when called, which can then be used for tests of component data types, etc.
+#'
+#' @usage get.component.classes()
+#'
+#' @return a character vector of the component objects classes, where each
+#' element is named by the corresponding slot name in the phyloseq-class.
+#'
+#' @keywords internal
+#'
+#' @examples #
+#' #get.component.classes()
+get.component.classes <- function(){
+  # Component classes (values) keyed by the phyloseq slot that holds them
+  # (names). The names need to be the slot names to match getSlots / splat.
+  c(
+    otu_table = "otu_table",
+    sam_data  = "sample_data",
+    phy_tree  = "phylo",
+    tax_table = "taxonomyTable",
+    refseq    = "XStringSet"
+  )
+}
+# Explicitly define components/slots that describe taxa.
+#' @keywords internal
+taxa.components = function(){
+  # Classes (values) of the components that describe taxa, keyed by the
+  # phyloseq slot name (names) to match getSlots / splat.
+  c(
+    otu_table = "otu_table",
+    phy_tree  = "phylo",
+    tax_table = "taxonomyTable",
+    refseq    = "XStringSet"
+  )
+}
+# Explicitly define components/slots that describe samples.
+#' @keywords internal
+sample.components = function(){
+  # Classes (values) of the components that describe samples, keyed by the
+  # phyloseq slot name (names) to match getSlots / splat.
+  c(
+    otu_table = "otu_table",
+    sam_data  = "sample_data"
+  )
+}
+# Returns TRUE if x is a component class, FALSE otherwise.
+# This shows up over and over again in data infrastructure
+#' @keywords internal
+is.component.class = function(x){
+  # TRUE when `x` inherits from any of the component classes.
+  component_classes <- get.component.classes()
+  inherits(x, what=component_classes)
+}
+################################################################################
+#' Convert \code{\link{phyloseq-class}} into a named list of its non-empty components.
+#'
+#' This is used in internal handling functions, and one of its key features
+#' is that the names in the returned-list match the slot-names, which is useful
+#' for constructing calls with language-computing functions like \code{\link{do.call}}.
+#' Another useful aspect is that it only returns the contents of non-empty slots.
+#' In general, this should only be used by phyloseq-package developers. Standard
+#' users should not need or use this function, and should use the accessors and
+#' other tools that leave the multi-component object in one piece.
+#'
+#' @usage splat.phyloseq.objects(x)
+#'
+#' @param x A \code{\link{phyloseq-class}} object. Alternatively, a component
+#' data object will work, resulting in named list of length 1.
+#'
+#' @return A named list, where each element is a component object that was contained
+#' in the argument, \code{x}. Each element is named according to its slot-name in
+#' the phyloseq-object from which it is derived.
+#' If \code{x} is already a component data object,
+#' then a list of length (1) is returned, also named.
+#'
+#' @seealso merge_phyloseq
+#' @keywords internal
+#' @examples #
+splat.phyloseq.objects <- function(x){
+  # Case 1: `x` is itself a component object. Wrap it in a length-1 list
+  # named after the slot whose class it inherits from.
+  if( is.component.class(x) ){
+    is_match <- vapply(get.component.classes(),
+                       function(cclass){ inherits(x, cclass) },
+                       logical(1))
+    splatx <- list(x)
+    names(splatx) <- names(which(is_match))
+    return(splatx)
+  }
+  # Case 2: neither a component nor a phyloseq object. Silently return NULL.
+  if( !inherits(x, "phyloseq") ){
+    return(NULL)
+  }
+  # Case 3: a phyloseq object. Read every slot and keep the non-NULL ones.
+  slotnames <- names(getSlots("phyloseq"))
+  allslots <- sapply(slotnames, function(i){ access(x, i, FALSE) })
+  is_empty <- sapply(allslots, is.null)
+  allslots[!is_empty]
+}
+################################################################################
+#' Return the non-empty slot names of a phyloseq object.
+#'
+#' Like \code{\link{getSlots}}, but returns the class name if argument
+#' is component data object.
+#'
+#' @usage getslots.phyloseq(physeq)
+#'
+#' @param physeq A \code{\link{phyloseq-class}} object. If \code{physeq} is a component
+#' data class, then just returns the class of \code{physeq}.
+#'
+#' @return identical to getSlots. A named character vector of the slot classes
+#' of a particular S4 class, where each element is named by the slot name it
+#' represents. If \code{physeq} is a component data object,
+#' then a vector of length (1) is returned, named according to its slot name in
+#' the \code{\link{phyloseq-class}}.
+#'
+#' @seealso merge_phyloseq
+#' @export
+#' @examples #
+#' data(GlobalPatterns)
+#' getslots.phyloseq(GlobalPatterns)
+#' data(esophagus)
+#' getslots.phyloseq(esophagus)
+getslots.phyloseq = function(physeq){
+  # The names of the splatted list are the names of the non-empty slots.
+  components <- splat.phyloseq.objects(physeq)
+  names(components)
+}
+################################################################################
+#' Universal slot accessor function for phyloseq-class.
+#'
+#' This function is used internally by many accessors and in
+#' many functions/methods that need to access a particular type of component data.
+#' If something is wrong, or the slot is missing, the expected behavior is that
+#' this function will return NULL. Thus, the output can be tested by
+#' \code{\link{is.null}} as verification of the presence of a particular
+#' data component. Unlike the component-specific accessors (e.g. \code{\link{otu_table}},
+#' or \code{\link{phy_tree}}),
+#' the default behavior is not to stop with an error if the desired slot is empty.
+#' In all cases this is controlled by the \code{errorIfNULL} argument, which can
+#' be set to \code{TRUE} if an error is desired.
+#'
+#' @usage access(physeq, slot, errorIfNULL=FALSE)
+#'
+#' @param physeq (Required). \code{\link{phyloseq-class}}.
+#'
+#' @param slot (Required). A character string indicating the slot (not data class)
+#' of the component data type that is desired.
+#'
+#' @param errorIfNULL (Optional). Logical. Should the accessor stop with
+#' an error if the slot is empty (\code{NULL})? Default \code{FALSE}.
+#'
+#' @return Returns the component object specified by the argument \code{slot}.
+#' Returns NULL if slot does not exist. Returns \code{physeq} as-is
+#' if it is a component class that already matches the slot name.
+#'
+#' @seealso \code{\link{getslots.phyloseq}}, \code{\link{merge_phyloseq}}
+#' @export
+#' @examples #
+#' ## data(GlobalPatterns)
+#' ## access(GlobalPatterns, "tax_table")
+#' ## access(GlobalPatterns, "phy_tree")
+#' ## access(otu_table(GlobalPatterns), "otu_table")
+#' ## # Should return NULL:
+#' ## access(otu_table(GlobalPatterns), "sample_data")
+#' ## access(otuTree(GlobalPatterns), "sample_data")
+#' ## access(otuSam(GlobalPatterns), "phy_tree")
+access <- function(physeq, slot, errorIfNULL=FALSE){
+  # Universal slot accessor.
+  #   physeq      - phyloseq object, or a single component object.
+  #   slot        - character string naming the wanted slot.
+  #   errorIfNULL - stop with an error when the result would be NULL?
+  # Returns the slot content, `physeq` itself when it is a component whose
+  # class matches `slot`, or NULL otherwise.
+  if( is.component.class(physeq) ){
+    # If physeq is a component class, might return as-is. Depends on slot.
+    if( inherits(physeq, get.component.classes()[slot]) ){
+      # if slot-name matches, return physeq as-is.
+      out = physeq
+    } else {
+      # If slot/component mismatch, set out to NULL. Test later if this is an error.
+      out = NULL
+    }
+  } else if(!slot %in% slotNames(physeq) ){
+    # If slot is invalid, set out to NULL. Test later if this is an error.
+    out = NULL
+  } else {
+    # By elimination, must be valid. Access slot.
+    # methods::slot() reads a slot by name directly, so no source text has
+    # to be pasted together and evaluated. The namespace prefix keeps the
+    # call readable next to the `slot` argument of this function.
+    out = methods::slot(physeq, slot)
+  }
+  # `&&` is the scalar operator meant for `if` conditions.
+  if( errorIfNULL && is.null(out) ){
+    # Only error regarding a NULL return value if errorIfNULL is TRUE.
+    stop(slot, " slot is empty.")
+  }
+  return(out)
+}
+################################################################################
+#' Returns the intersection of species and samples for the components of x
+#'
+#' This function is used internally as part of the infrastructure to ensure that
+#' component data types in a phyloseq-object have exactly the same taxa/species.
+#' It relies heavily on the \code{\link{Reduce}} function to determine the
+#' strictly common species.
+#'
+#' @usage intersect_taxa(x)
+#'
+#' @param x (Required). A \code{\link{phyloseq-class}} object
+#' that contains 2 or more components
+#' that in-turn describe species/taxa.
+#'
+#' @return Returns a character vector of only those species that are present in
+#' all species-describing components of \code{x}.
+#'
+#' @seealso \code{\link{Reduce}}, \code{\link{intersect}}
+#' @keywords internal
+#' @examples #
+#' ## data(GlobalPatterns)
+#' ## head(intersect_taxa(GlobalPatterns), 10)
+intersect_taxa <- function(x){
+  # Collect taxa_names() from every slot, discard the NULL results, and
+  # intersect what remains.
+  per_slot <- f_comp_ps("taxa_names", x)
+  present <- Filter(Negate(is.null), per_slot)
+  Reduce("intersect", present)
+}
+#' @keywords internal
+intersect_samples <- function(x){
+  # Collect sample_names() from every slot, discard the NULL results, and
+  # intersect what remains.
+  per_slot <- f_comp_ps("sample_names", x)
+  present <- Filter(Negate(is.null), per_slot)
+  Reduce("intersect", present)
+}
+################################################################################
+#' Force index order of phyloseq objects
+#'
+#' @usage index_reorder(ps, index_type)
+#'
+#' @param ps (Required). A \code{\link{phyloseq-class}} instance.
+#' @param index_type (Optional). A character string
+#' specifying the indices to properly order.
+#' Supported values are \code{c("both", "taxa", "samples")}.
+#' Default is \code{"both"}, meaning samples and taxa indices
+#' will be checked/re-ordered.
+#'
+#' @keywords internal
+#' @docType methods
+#'
+#' @examples
+#' ## data("GlobalPatterns")
+#' ## GP = index_reorder(GlobalPatterns)
+setGeneric("index_reorder", function(ps, index_type){
+  standardGeneric("index_reorder")
+})
+#' @rdname index_reorder
+#' @aliases index_reorder,phyloseq-method
+setMethod("index_reorder", "phyloseq", function(ps, index_type="both"){
+  # Put the taxa and/or sample indices of all components in one order.
+  #   ps         - phyloseq object.
+  #   index_type - "both", "taxa" or "samples".
+  # Returns the re-ordered phyloseq object.
+  # (Slot assignments use `@`; the mail archive had rewritten it as " at ",
+  # which is not valid R.)
+  if( index_type %in% c("both", "taxa") ){
+    ## ENFORCE CONSISTENT ORDER OF TAXA INDICES.
+    if( !is.null(phy_tree(ps, FALSE)) ){
+      # If there is a phylogenetic tree included,
+      # re-order based on that, and reorder the otu_table
+      # The new taxa order, torder, will also trickle down to
+      # the taxonomyTable or XStringSet if present.
+      torder = taxa_names(phy_tree(ps))
+      # Re-order the OTU table
+      if( taxa_are_rows(ps) ){
+        ps@otu_table = otu_table(ps)[torder, ]
+      } else {
+        ps@otu_table = otu_table(ps)[, torder]
+      }
+    } else {
+      # Else, re-order anything/everything else based on the OTU-table order
+      torder = taxa_names(otu_table(ps))
+    }
+    if( !is.null(tax_table(ps, FALSE)) ){
+      # If there is a taxonomyTable, re-order that too.
+      ps@tax_table = tax_table(ps)[torder, ]
+    }
+    if( !is.null(refseq(ps, FALSE)) ){
+      # If there is a XStringSet, re-order that too.
+      ps@refseq = refseq(ps)[torder]
+    }
+  }
+
+  if( index_type %in% c("both", "samples") ){
+    ## ENFORCE CONSISTENT ORDER OF SAMPLE INDICES
+    # Errors can creep when sample indices do not match.
+    if( !is.null(sample_data(ps, FALSE)) ){
+      # check first that ps has sample_data
+      if( !all(sample_names(otu_table(ps)) == rownames(sample_data(ps))) ){
+        # Reorder the sample_data rows so that they match the otu_table order.
+        ps@sam_data <- sample_data(ps)[sample_names(otu_table(ps)), ]
+      }
+    }
+  }
+
+  return(ps)
+})
+################################################################################
\ No newline at end of file
diff --git a/R/plot-methods.R b/R/plot-methods.R
new file mode 100644
index 0000000..191d1b5
--- /dev/null
+++ b/R/plot-methods.R
@@ -0,0 +1,2857 @@
+#
+# extension of plot methods for phyloseq object.
+#
+################################################################################
+#' Generic plot defaults for phyloseq.
+#'
+#' There are many useful examples of phyloseq graphics functions in the
+#' \href{http://joey711.github.io/phyloseq}{phyloseq online tutorials}.
+#' The specific plot type is chosen according to available non-empty slots.
+#' This is mainly for syntactic convenience and quick-plotting. See links below
+#' for some examples of the graphics tools available in the
+#' \code{\link{phyloseq-package}}.
+#'
+#' @usage plot_phyloseq(physeq, ...)
+#'
+#' @param physeq (Required). \code{\link{phyloseq-class}}. The actual plot type
+#' depends on the available (non-empty) component data types contained within.
+#'
+#' @param ... (Optional). Additional parameters to be passed on to the respective
+#' specific plotting function. See below for different plotting functions that
+#' might be called by this generic plotting wrapper.
+#'
+#' @return A plot is created. The nature and class of the plot depends on
+#' the \code{physeq} argument, specifically, which component data classes
+#' are present.
+#'
+#' @seealso
+#' \href{http://joey711.github.io/phyloseq/tutorials-index.html}{phyloseq frontpage tutorials}.
+#'
+#' \code{\link{plot_ordination}}
+#' \code{\link{plot_heatmap}}
+#' \code{\link{plot_tree}}
+#' \code{\link{plot_network}}
+#' \code{\link{plot_bar}}
+#' \code{\link{plot_richness}}
+#'
+#' @export
+#' @docType methods
+#' @rdname plot_phyloseq-methods
+#'
+#' @examples
+#' data(esophagus)
+#' plot_phyloseq(esophagus)
+setGeneric("plot_phyloseq", function(physeq, ...){ standardGeneric("plot_phyloseq") })
+#' @aliases plot_phyloseq,phyloseq-method
+#' @rdname plot_phyloseq-methods
+setMethod("plot_phyloseq", "phyloseq", function(physeq, ...){
+  # Pick a default plot according to which component slots of `physeq`
+  # are non-empty. Only the plot_bar() branch forwards `...`.
+  available_slots <- getslots.phyloseq(physeq)
+  if( all(c("otu_table", "sample_data", "phy_tree") %in% available_slots) ){
+    # Tree plus sample data: annotated tree of the supplied object
+    plot_tree(physeq, color="samples")
+  } else if( all(c("otu_table", "sample_data", "tax_table") %in% available_slots) ){
+    plot_bar(physeq, ...)
+  } else if( all(c("otu_table", "phy_tree") %in% available_slots) ){
+    # Tree without sample data: still plot the supplied object's tree
+    plot_tree(physeq, color="samples")
+  } else {
+    plot_richness(physeq)
+  }
+})
+################################################################################
+# For simplicity, the most common ggplot2 dependency functions/objects
+# will be imported only here.
+# Less-common functions will be listed in the roxygen header above those functions
+# but rarely will these common imports be re-listed elsewhere in other plot_ functions,
+# even though it is often good practice to do so.
+################################################################################
+#' Microbiome Network Plot using ggplot2
+#'
+#' There are many useful examples of phyloseq network graphics in the
+#' \href{http://joey711.github.io/phyloseq/plot_network-examples}{phyloseq online tutorials}.
+#' A custom plotting function for displaying networks
+#' using advanced \code{\link[ggplot2]{ggplot}}2 formatting.
+#' The network itself should be represented using
+#' the \code{igraph} package.
+#' For the \code{\link{phyloseq-package}} it is suggested that the network object
+#' (argument \code{g})
+#' be created using the
+#' \code{\link{make_network}} function,
+#' and based upon sample-wise or taxa-wise microbiome ecological distances
+#' calculated from a phylogenetic sequencing experiment
+#' (\code{\link{phyloseq-class}}).
+#' In this case, edges in the network are created if the distance between
+#' nodes is below a potentially arbitrary threshold,
+#' and special care should be given to considering the choice of this threshold.
+#'
+#' @usage plot_network(g, physeq=NULL, type="samples",
+#' color=NULL, shape=NULL, point_size=4, alpha=1,
+#' label="value", hjust = 1.35,
+#' line_weight=0.5, line_color=color, line_alpha=0.4,
+#' layout.method=layout.fruchterman.reingold, title=NULL)
+#'
+#' @param g (Required). An \code{igraph}-class object created
+#' either by the convenience wrapper \code{\link{make_network}},
+#' or directly by the tools in the igraph-package.
+#'
+#' @param physeq (Optional). Default \code{NULL}.
+#' A \code{\link{phyloseq-class}} object on which \code{g} is based.
+#'
+#' @param type (Optional). Default \code{"samples"}.
+#' Whether the network represented in the primary argument, \code{g},
+#' is samples or taxa/OTUs.
+#' Supported arguments are \code{"samples"}, \code{"taxa"},
+#' where \code{"taxa"} indicates using the taxa indices,
+#' whether they actually represent species or some other taxonomic rank.
+#'
+#' @param color (Optional). Default \code{NULL}.
+#' The name of the sample variable in \code{physeq} to use for color mapping
+#' of points (graph vertices).
+#'
+#' @param shape (Optional). Default \code{NULL}.
+#' The name of the sample variable in \code{physeq} to use for shape mapping.
+#' of points (graph vertices).
+#'
+#' @param point_size (Optional). Default \code{4}.
+#' The size of the vertex points.
+#'
+#' @param alpha (Optional). Default \code{1}.
+#' A value between 0 and 1 for the alpha transparency of the vertex points.
+#'
+#' @param label (Optional). Default \code{"value"}.
+#' The name of the sample variable in \code{physeq} to use for
+#' labelling the vertex points.
+#'
+#' @param hjust (Optional). Default \code{1.35}.
+#' The amount of horizontal justification to use for each label.
+#'
+#' @param line_weight (Optional). Default \code{0.5}.
+#' The line thickness to use to label graph edges.
+#'
+#' @param line_color (Optional). Default \code{color}.
+#' The name of the sample variable in \code{physeq} to use for color mapping
+#' of lines (graph edges).
+#'
+#' @param line_alpha (Optional). Default \code{0.4}.
+#' The transparency level for graph-edge lines.
+#'
+#' @param layout.method (Optional). Default \code{layout.fruchterman.reingold}.
+#' A function (closure) that determines the placement of the vertices
+#' for drawing a graph. Should be able to take an \code{igraph}-class
+#' as sole argument, and return a two-column coordinate matrix with \code{nrow}
+#' equal to the number of vertices. For possible options already included in
+#' \code{igraph}-package, see the others also described in the help file:
+#' \code{\link[igraph]{layout.fruchterman.reingold}}
+#'
+#' @param title (Optional). Default \code{NULL}. Character string.
+#' The main title for the graphic.
+#'
+#'
+#' @return A \code{\link{ggplot}}2 plot representing the network,
+#' with optional mapping of variable(s) to point color or shape.
+#'
+#' @seealso
+#' \code{\link{make_network}}
+#'
+#' @references
+#' This code was adapted from a repo originally hosted on GitHub by Scott Chamberlain:
+#' \url{https://github.com/SChamberlain/gggraph}
+#'
+#' The code most directly used/modified was first posted here:
+#' \url{http://www.r-bloggers.com/basic-ggplot2-network-graphs/}
+#'
+#'
+#' @import reshape2
+#' @importFrom igraph layout.fruchterman.reingold
+#' @importFrom igraph get.edgelist
+#' @importFrom igraph get.vertex.attribute
+#' @importFrom igraph vcount
+#' @importFrom ggplot2 ggplot
+#' @importFrom ggplot2 aes_string
+#' @importFrom ggplot2 aes
+#' @importFrom ggplot2 geom_point
+#' @importFrom ggplot2 geom_text
+#' @importFrom ggplot2 geom_line
+#' @importFrom ggplot2 geom_path
+#' @importFrom ggplot2 theme
+#' @importFrom ggplot2 theme_bw
+#' @importFrom ggplot2 element_blank
+#' @importFrom ggplot2 ggtitle
+#'
+#' @export
+#' @examples
+#'
+#' data(enterotype)
+#' ig <- make_network(enterotype, max.dist=0.3)
+#' plot_network(ig, enterotype, color="SeqTech", shape="Enterotype", line_weight=0.3, label=NULL)
+#' # Change distance parameter
+#' ig <- make_network(enterotype, max.dist=0.2)
+#' plot_network(ig, enterotype, color="SeqTech", shape="Enterotype", line_weight=0.3, label=NULL)
+plot_network <- function(g, physeq=NULL, type="samples",
+  color=NULL, shape=NULL, point_size=4, alpha=1,
+  label="value", hjust = 1.35,
+  line_weight=0.5, line_color=color, line_alpha=0.4,
+  layout.method=layout.fruchterman.reingold, title=NULL){
+
+  if( vcount(g) < 2 ){
+    # Stop with an error if the graph has fewer than two vertices
+    stop("The graph you provided, `g`, has too few vertices.
+ Check your graph, or the output of `make_network` and try again.")
+  }
+
+  # disambiguate species/OTU/taxa as argument type...
+  if( type %in% c("taxa", "species", "OTUs", "otus", "otu") ){
+    type <- "taxa"
+  }
+
+  # Make the edge-coordinates data.frame: one row per edge plus a unique edge id.
+  # seq_len() keeps `id` empty (instead of c(1, 0)) when the graph has no edges.
+  edgeDF    <- data.frame(get.edgelist(g))
+  edgeDF$id <- seq_len(nrow(edgeDF))
+
+  # Make the vertices-coordinates data.frame
+  vertDF    <- layout.method(g)
+  colnames(vertDF) <- c("x", "y")
+  vertDF    <- data.frame(value=get.vertex.attribute(g, "name"), vertDF)
+
+  # If phyloseq object provided,
+  # AND it has the relevant additional data
+  # THEN add it to vertDF
+  if( !is.null(physeq) ){
+    extraData <- NULL
+    # `&&` short-circuits: the accessor is only called for the matching type
+    if( type == "samples" && !is.null(sample_data(physeq, FALSE)) ){
+      extraData <- data.frame(sample_data(physeq))[as.character(vertDF$value), , drop=FALSE]
+    } else if( type == "taxa" && !is.null(tax_table(physeq, FALSE)) ){
+      extraData <- data.frame(tax_table(physeq))[as.character(vertDF$value), , drop=FALSE]
+    }
+    # Only mod vertDF if extraData exists
+    if( !is.null(extraData) ){
+      vertDF <- data.frame(vertDF, extraData)
+    }
+  }
+
+  # Combine vertex and edge coordinate data.frames:
+  # melting gives one row per edge end-point, which is then matched to its vertex
+  graphDF <- merge(reshape2::melt(edgeDF, id="id"), vertDF, by = "value")
+
+  # Initialize the ggplot
+  p <- ggplot(vertDF, aes(x, y))
+
+  # Strip all the typical annotations from the plot, leave the legend
+  p <- p + theme_bw() +
+    theme(
+      panel.grid.major = element_blank(),
+      panel.grid.minor = element_blank(),
+      axis.text.x      = element_blank(),
+      axis.text.y      = element_blank(),
+      axis.title.x     = element_blank(),
+      axis.title.y     = element_blank(),
+      axis.ticks       = element_blank(),
+      panel.border     = element_blank()
+    )
+
+  # Add the graph vertices as points
+  p <- p + geom_point(aes_string(color=color, shape=shape), size=point_size, na.rm=TRUE)
+
+  # Add the text labels
+  if( !is.null(label) ){
+    p <- p + geom_text(aes_string(label=label), size = 2, hjust=hjust, na.rm=TRUE)
+  }
+
+  # Add the edges: each edge id groups its two end-points into one line
+  p <- p + geom_line(aes_string(group="id", color=line_color),
+                     graphDF, size=line_weight, alpha=line_alpha, na.rm=TRUE)
+
+  # Optionally add a title to the plot
+  if( !is.null(title) ){
+    p <- p + ggtitle(title)
+  }
+
+  return(p)
+}
+################################################################################
+#' Microbiome Network Plot using ggplot2
+#'
+#' There are many useful examples of phyloseq network graphics in the
+#' \href{http://joey711.github.io/phyloseq/plot_net-examples}{phyloseq online tutorials}.
+#' A custom plotting function for displaying networks
+#' using advanced \code{\link[ggplot2]{ggplot}}2 formatting.
+#' Note that this function is a performance and interface revision to
+#' \code{\link{plot_network}}, which requires an \code{\link[igraph]{igraph}}
+#' object as its first argument.
+#' This new function is more in-line with other
+#' \code{plot_*} functions in the \code{\link{phyloseq-package}}, in that its
+#' first/main argument is a \code{\link{phyloseq-class}} instance.
+#' Edges in the network are created if the distance between
+#' nodes is below a (potentially arbitrary) threshold,
+#' and special care should be given to considering the choice of this threshold.
+#' However, network line thickness and opacity is scaled according to the
+#' similarity of vertices (either samples or taxa),
+#' helping to temper, somewhat, the effect of the threshold.
+#' Also note that the choice of network layout algorithm can have a large effect
+#' on the impression and interpretability of the network graphic,
+#' and you may want to familiarize yourself with some of these options
+#' (see the \code{laymeth} argument).
+#'
+#' @param physeq (Required).
+#' The \code{\link{phyloseq-class}} object that you want to represent as a network.
+#'
+#' @param distance (Optional). Default is \code{"bray"}.
+#' Can be either a distance method supported by \code{\link[phyloseq]{distance}},
+#' or an already-computed \code{\link{dist}}-class with labels that match
+#' the indices implied by both the \code{physeq} and \code{type} arguments
+#' (that is, either sample or taxa names).
+#' If you used \code{\link[phyloseq]{distance}} to pre-calculate your \code{\link{dist}}ance,
+#' and the same \code{type} argument as provided here, then they will match.
+#'
+#' @param maxdist (Optional). Default \code{0.7}.
+#' The maximum distance value between two vertices
+#' to connect with an edge in the graphic.
+#'
+#' @param type (Optional). Default \code{"samples"}.
+#' Whether the vertices of the network built from \code{physeq}
+#' are samples or taxa/OTUs.
+#' Supported arguments are \code{"samples"}, \code{"taxa"},
+#' where \code{"taxa"} indicates using the taxa indices,
+#' whether they actually represent species or some other taxonomic rank.
+#'
+#' @param laymeth (Optional). Default \code{"fruchterman.reingold"}.
+#' A character string that indicates the method that will determine
+#' the placement of vertices, typically based on connectedness of vertices
+#' and the number of vertices.
+#' This is an interesting topic, and there are lots of options.
+#' See \code{\link{igraph-package}} for related topics in general,
+#' and see \code{\link[igraph]{layout.auto}} for descriptions of various
+#' alternative layout method options supported here.
+#' The character string argument should match exactly the
+#' layout function name with the \code{"layout."} omitted.
+#' Try \code{laymeth="list"} to see a list of options.
+#'
+#' @param color (Optional). Default \code{NULL}.
+#' The name of the sample variable in \code{physeq} to use for color mapping
+#' of points (graph vertices).
+#'
+#' @param shape (Optional). Default \code{NULL}.
+#' The name of the sample variable in \code{physeq} to use for shape mapping.
+#' of points (graph vertices).
+#'
+#' @param rescale (Optional). Logical. Default \code{FALSE}.
+#' Whether to rescale the distance values to be \code{[0, 1]}, in which the
+#' min value is close to zero and the max value is 1.
+#'
+#' @param point_size (Optional). Default \code{5}.
+#' The size of the vertex points.
+#'
+#' @param point_alpha (Optional). Default \code{1}.
+#' A value between 0 and 1 for the alpha transparency of the vertex points.
+#'
+#' @param point_label (Optional). Default \code{NULL}.
+#' The variable name in \code{physeq} covariate data to map to vertex labels.
+#'
+#' @param hjust (Optional). Default \code{1.35}.
+#' The amount of horizontal justification to use for each label.
+#'
+#' @param title (Optional). Default \code{NULL}. Character string.
+#' The main title for the graphic.
+#'
+#' @return A \code{\link{ggplot}}2 network plot.
+#' Will render to default graphic device automatically as print side effect.
+#' Can also be saved, further manipulated, or rendered to
+#' a vector or raster file using \code{\link{ggsave}}.
+#'
+#' @seealso
+#' Original network plotting functions:
+#'
+#' \code{\link{make_network}}
+#'
+#' \code{\link{plot_network}}
+#'
+#'
+#' @import reshape2
+#'
+#' @importFrom data.table data.table
+#' @importFrom data.table copy
+#'
+#' @importFrom igraph layout.auto
+#' @importFrom igraph layout.random
+#' @importFrom igraph layout.circle
+#' @importFrom igraph layout.sphere
+#' @importFrom igraph layout.fruchterman.reingold
+#' @importFrom igraph layout.kamada.kawai
+#' @importFrom igraph layout.spring
+#' @importFrom igraph layout.reingold.tilford
+#' @importFrom igraph layout.fruchterman.reingold.grid
+#' @importFrom igraph layout.lgl
+#' @importFrom igraph layout.graphopt
+#' @importFrom igraph layout.svd
+#' @importFrom igraph graph.data.frame
+#' @importFrom igraph get.vertex.attribute
+#'
+#' @importFrom ggplot2 geom_segment
+#' @importFrom ggplot2 scale_alpha
+#' @importFrom ggplot2 scale_size
+#'
+#' @export
+#' @examples
+#' data(enterotype)
+#' plot_net(enterotype, color="SeqTech", maxdist = 0.3)
+#' plot_net(enterotype, color="SeqTech", maxdist = 0.3, laymeth = "auto")
+#' plot_net(enterotype, color="SeqTech", maxdist = 0.3, laymeth = "svd")
+#' plot_net(enterotype, color="SeqTech", maxdist = 0.3, laymeth = "circle")
+#' plot_net(enterotype, color="SeqTech", shape="Enterotype", maxdist = 0.3, laymeth = "circle")
+plot_net <- function(physeq, distance="bray", type="samples", maxdist = 0.7,
+                     laymeth="fruchterman.reingold", color=NULL, shape=NULL, rescale=FALSE,
+                     point_size=5, point_alpha=1, point_label=NULL, hjust = 1.35, title=NULL){
+  # Supported layout methods, keyed by the name accepted in `laymeth`
+  available_layouts = list(
+    auto = layout.auto,
+    random = layout.random,
+    circle = layout.circle,
+    sphere = layout.sphere,
+    fruchterman.reingold = layout.fruchterman.reingold,
+    kamada.kawai = layout.kamada.kawai,
+    spring = layout.spring,
+    reingold.tilford = layout.reingold.tilford,
+    fruchterman.reingold.grid = layout.fruchterman.reingold.grid,
+    lgl = layout.lgl,
+    graphopt = layout.graphopt,
+    svd = layout.svd
+  )
+  # `laymeth = "list"` is a query: return the supported names instead of a plot
+  if(laymeth=="list"){
+    return(names(available_layouts))
+  }
+  if(!laymeth %in% names(available_layouts)){
+    stop("Unsupported argument to `laymeth` option.
+ Please use an option returned by `plot_net(laymeth='list')`")
+  }
+  # 1.
+  # Calculate Distance
+  if( inherits(distance, "dist") ){
+    # If distance a distance object, use it rather than re-calculate
+    Distance <- distance
+    # Check that it at least has (a subset of) the correct labels
+    possibleVertexLabels = switch(type, taxa=taxa_names(physeq), samples=sample_names(physeq))
+    if( !all(attributes(distance)$Labels %in% possibleVertexLabels) ){
+      stop("Some or all `distance` index labels do not match ", type, " names in `physeq`")
+    }
+  } else {
+    # Coerce to character and attempt distance calculation
+    scaled_distance = function(physeq, method, type, rescale=TRUE){
+      Dist = distance(physeq, method, type)
+      if(rescale){
+        # rescale the distance matrix: divide by the maximum,
+        # then shift so that the minimum is zero
+        Dist <- Dist / max(Dist, na.rm=TRUE)
+        Dist <- Dist - min(Dist, na.rm=TRUE)
+      }
+      return(Dist)
+    }
+    distance <- as(distance[1], "character")
+    Distance = scaled_distance(physeq, distance, type, rescale)
+  }
+  # 2.
+  # Create edge data.table
+  dist_to_edge_table = function(Dist, MaxDistance=NULL, vnames = c("v1", "v2")){
+    dmat <- as.matrix(Dist)
+    # Set duplicate entries and self-links to Inf
+    dmat[upper.tri(dmat, diag = TRUE)] <- Inf
+    LinksData = data.table(reshape2::melt(dmat, varnames=vnames, as.is = TRUE))
+    setnames(LinksData, old = "value", new = "Distance")
+    # Remove self-links and duplicate links
+    LinksData <- LinksData[is.finite(Distance), ]
+    # Remove entries above the threshold, MaxDistance
+    if(!is.null(MaxDistance)){
+      LinksData <- LinksData[Distance < MaxDistance, ]
+    }
+    return(LinksData)
+  }
+  LinksData0 = dist_to_edge_table(Distance, maxdist)
+  # 3. Create vertex layout
+  # Make the vertices-coordinates data.table
+  vertex_layout = function(LinksData, physeq=NULL, type="samples",
+                           laymeth=igraph::layout.fruchterman.reingold, ...){
+    # `physeq` can be anything, only has effect when non-NULL returned by sample_data or tax_table
+    g = igraph::graph.data.frame(LinksData, directed=FALSE)
+    vertexDT = data.table(laymeth(g, ...),
+                          vertex=get.vertex.attribute(g, "name"))
+    setkeyv(vertexDT, "vertex")
+    setnames(vertexDT, old = c(1, 2), new = c("x", "y"))
+    extraData = NULL
+    # `&&` short-circuits: the accessor is only called for the matching type
+    if( type == "samples" && !is.null(sample_data(physeq, FALSE)) ){
+      extraData <- data.table(data.frame(sample_data(physeq)), key = "rn", keep.rownames = TRUE)
+    } else if( type == "taxa" && !is.null(tax_table(physeq, FALSE)) ){
+      extraData <- data.table(as(tax_table(physeq), "matrix"), key = "rn", keep.rownames = TRUE)
+    }
+    # Only mod vertexDT if extraData exists
+    if(!is.null(extraData)){
+      # Join vertexDT, extraData by vertex
+      setnames(extraData, old = "rn", new = "vertex")
+      setkeyv(vertexDT, "vertex")
+      setkeyv(extraData, "vertex")
+      vertexDT <- copy(vertexDT[extraData])
+      # Drop entries of extraData that are not vertices in the graph (no coordinates)
+      vertexDT <- vertexDT[!is.na(x), ]
+    }
+    return(vertexDT)
+  }
+  vertexDT = vertex_layout(LinksData0, physeq, type, available_layouts[[laymeth]])
+  # 4.
+  # Update the links layout for ggplot: x, y, xend, yend
+  link_layout = function(LinksData, vertexDT){
+    # NOTE(review): in `vertexDT[i, x, y]` the third positional argument is
+    # data.table's `by`, so the result has columns `y` and `x` (hence the
+    # setnames order below) -- confirm row order always matches LinksData.
+    linkstart = copy(vertexDT[LinksData$v1, x, y])
+    linkend = copy(vertexDT[LinksData$v2, x, y])
+    setnames(linkend, old = c("y", "x"), new = c("yend", "xend"))
+    LinksData <- copy(cbind(LinksData, linkstart, linkend))
+    return(LinksData)
+  }
+  LinksData = link_layout(LinksData0, vertexDT)
+  # 5.
+  # Define ggplot2 network plot
+  p = ggplot(data=LinksData) +
+    geom_segment(mapping = aes(x, y,
+                               xend = xend,
+                               yend = yend,
+                               size = Distance,
+                               alpha = Distance)) +
+    geom_point(mapping = aes_string(x="x", y="y",
+                                    color = color,
+                                    shape = shape),
+               data = vertexDT,
+               size = point_size,
+               alpha = point_alpha,
+               na.rm = TRUE) +
+    # Reversed ranges: smaller distances get thicker, more opaque segments
+    scale_alpha(range = c(1, 0.1)) +
+    scale_size(range = c(2, 0.25))
+  # Add labels
+  if(!is.null(point_label)){
+    p <- p + geom_text(aes_string(x="x", y="y", label=point_label),
+                       data = vertexDT, size = 2, hjust = hjust, na.rm = TRUE)
+  }
+  # Add default theme
+  net_theme = theme(
+    panel.grid.major = element_blank(),
+    panel.grid.minor = element_blank(),
+    axis.text.x      = element_blank(),
+    axis.text.y      = element_blank(),
+    axis.title.x     = element_blank(),
+    axis.title.y     = element_blank(),
+    axis.ticks       = element_blank(),
+    panel.border     = element_blank()
+  )
+  p <- p + theme_bw() + net_theme
+  # Optionally add a title to the plot
+  if( !is.null(title) ){
+    p <- p + ggtitle(title)
+  }
+  return(p)
+}
+################################################################################
+#' Plot alpha diversity, flexibly with ggplot2
+#'
+#' There are many useful examples of alpha-diversity graphics in the
+#' \href{http://joey711.github.io/phyloseq/plot_richness-examples}{phyloseq online tutorials}.
+#' This function estimates a number of alpha-diversity metrics using the
+#' \code{\link{estimate_richness}} function,
+#' and returns a \code{ggplot} plotting object.
+#' The plot generated by this function will include every sample
+#' in \code{physeq}, but they can be further grouped on the horizontal axis
+#' through the argument to \code{x},
+#' and shaded according to the argument to \code{color} (see below).
+#' You must use untrimmed, non-normalized count data for meaningful results,
+#' as many of these estimates are highly dependent on the number of singletons.
+#' You can always trim the data later on if needed,
+#' just not before using this function.
+#'
+#' NOTE: Because this plotting function incorporates the output from
+#' \code{\link{estimate_richness}}, the variable names of that output should
+#' not be used as \code{x} or \code{color} (even if it works, the resulting
+#' plot might be kind of strange, and not the intended behavior of this function).
+#' The following are the names you will want to avoid using in \code{x} or \code{color}:
+#'
+#' \code{c("Observed", "Chao1", "ACE", "Shannon", "Simpson", "InvSimpson", "Fisher")}.
+#'
+#' @param physeq (Required). \code{\link{phyloseq-class}}, or alternatively,
+#' an \code{\link{otu_table-class}}. The data about which you want to estimate.
+#'
+#' @param x (Optional). A variable to map to the horizontal axis. The vertical
+#' axis will be mapped to the alpha diversity index/estimate
+#' and have units of total taxa, and/or index value (dimensionless).
+#' This parameter (\code{x}) can be either a character string indicating a
+#' variable in \code{sample_data}
+#' (among the set returned by \code{sample_variables(physeq)} );
+#' or a custom supplied vector with length equal to the number of samples
+#' in the dataset (nsamples(physeq)).
+#'
+#' The default value is \code{"samples"}, which will map each sample's name
+#' to a separate horizontal position in the plot.
+#'
+#' @param color (Optional). Default \code{NULL}.
+#' The sample variable to map to different colors.
+#' Like \code{x}, this can be a single character string of the variable name in
+#' \code{sample_data}
+#' (among the set returned by \code{sample_variables(physeq)} );
+#' or a custom supplied vector with length equal to the number of samples
+#' in the dataset (nsamples(physeq)).
+#' The color scheme is chosen automatically by \code{\link{ggplot}},
+#' but it can be modified afterward with an additional layer using
+#' \code{\link[ggplot2]{scale_color_manual}}.
+#'
+#' @param shape (Optional). Default \code{NULL}. The sample variable to map
+#' to different shapes. Like \code{x} and \code{color},
+#' this can be a single character string
+#' of the variable name in
+#' \code{sample_data}
+#' (among the set returned by \code{sample_variables(physeq)} );
+#' or a custom supplied vector with length equal to the number of samples
+#' in the dataset (nsamples(physeq)).
+#' The shape scale is chosen automatically by \code{\link{ggplot}},
+#' but it can be modified afterward with an additional layer using
+#' \code{\link[ggplot2]{scale_shape_manual}}.
+#'
+#' @param title (Optional). Default \code{NULL}. Character string.
+#' The main title for the graphic.
+#'
+#' @param scales (Optional). Default \code{"free_y"}.
+#' Whether to let vertical axis have free scale that adjusts to
+#' the data in each panel.
+#' This argument is passed to \code{\link[ggplot2]{facet_wrap}}.
+#' If set to \code{"fixed"}, a single vertical scale will
+#' be used in all panels. This can obscure values if the
+#' \code{measures} argument includes both
+#' richness estimates and diversity indices, for example.
+#'
+#' @param nrow (Optional). Default is \code{1},
+#' meaning that all plot panels will be placed in a single row,
+#' side-by-side.
+#' This argument is passed to \code{\link[ggplot2]{facet_wrap}}.
+#' If \code{NULL}, the number of rows and columns will be
+#' chosen automatically (wrapped) based on the number of panels
+#' and the size of the graphics device.
+#'
+#' @param shsi (Deprecated). No longer supported. Instead see `measures` below.
+#'
+#' @param measures (Optional). Default is \code{NULL}, meaning that
+#' all available alpha-diversity measures will be included in plot panels.
+#' Alternatively, you can specify one or more measures
+#' as a character vector of measure names.
+#' Values must be among those supported:
+#' \code{c("Observed", "Chao1", "ACE", "Shannon", "Simpson", "InvSimpson", "Fisher")}.
+#'
+#' @param sortby (Optional). A character string subset of \code{measures} argument.
+#' Sort x-indices by the mean of one or more \code{measures},
+#' if x-axis is mapped to a discrete variable.
+#' Default is \code{NULL}, implying that a discrete-value horizontal axis
+#' will use default sorting, usually alphabetic.
+#'
+#' @return A \code{\link{ggplot}} plot object summarizing
+#' the richness estimates, and their standard error.
+#'
+#' @seealso
+#' \code{\link{estimate_richness}}
+#'
+#' \code{\link[vegan]{estimateR}}
+#'
+#' \code{\link[vegan]{diversity}}
+#'
+#' There are many more interesting examples at the
+#' \href{http://joey711.github.io/phyloseq/plot_richness-examples}{phyloseq online tutorials}.
+#'
+#'
+#' @import reshape2
+#'
+#' @importFrom plyr is.discrete
+#'
+#' @importFrom ggplot2 geom_errorbar
+#' @importFrom ggplot2 facet_wrap
+#' @importFrom ggplot2 element_text
+#'
+#' @export
+#' @examples
+#' ## There are many more interesting examples at the phyloseq online tutorials.
+#' ## http://joey711.github.io/phyloseq/plot_richness-examples
+#' data("soilrep")
+#' plot_richness(soilrep, measures=c("InvSimpson", "Fisher"))
+#' plot_richness(soilrep, "Treatment", "warmed", measures=c("Chao1", "ACE", "InvSimpson"), nrow=3)
+#' data("GlobalPatterns")
+#' plot_richness(GlobalPatterns, x="SampleType", measures=c("InvSimpson"))
+#' plot_richness(GlobalPatterns, x="SampleType", measures=c("Chao1", "ACE", "InvSimpson"), nrow=3)
+#' plot_richness(GlobalPatterns, x="SampleType", measures=c("Chao1", "ACE", "InvSimpson"), nrow=3, sortby = "Chao1")
+plot_richness = function(physeq, x="samples", color=NULL, shape=NULL, title=NULL,
+                          scales="free_y", nrow=1, shsi=NULL, measures=NULL, sortby=NULL){
+  # Calculate the relevant alpha-diversity measures
+  erDF = estimate_richness(physeq, split=TRUE, measures=measures)
+  # Measures may have been renamed in `erDF`. Replace it with the name from erDF
+  measures = colnames(erDF)
+  # Define "measure" variables and s.e. labels, for melting.
+  ses = colnames(erDF)[grep("^se\\.", colnames(erDF))]
+  # Remove any S.E. from `measures`
+  measures = measures[!measures %in% ses]
+  # Make the plotting data.frame.
+  # This coerces to data.frame, required for reliable output from reshape2::melt()
+  if( !is.null(sample_data(physeq, errorIfNULL=FALSE)) ){
+    # Include the sample data, if it is there.
+    DF <- data.frame(erDF, sample_data(physeq))
+  } else {
+    # If no sample data, leave it out.
+    DF <- data.frame(erDF)
+  }
+  if( !"samples" %in% colnames(DF) ){
+    # If there is no "samples" variable in DF, add it
+    DF$samples <- sample_names(physeq)
+  }
+  # sample_names used to be default, and should also work.
+  # #backwardcompatibility
+  if( !is.null(x) ){
+    if( x %in% c("sample", "samples", "sample_names", "sample.names") ){
+      x <- "samples"
+    }
+  } else {
+    # If x was NULL for some reason, set it to "samples"
+    x <- "samples"
+  }
+  # melt to display different alpha-measures separately
+  mdf = reshape2::melt(DF, measure.vars=measures)
+  # Initialize the se column. Helpful even if not used.
+  mdf$se <- NA_real_
+  if( length(ses) > 0 ){
+    ## Merge s.e. into one "se" column
+    # Define conversion vector, `selabs`:
+    # values are the s.e. column names, names are the matching measure names.
+    selabs = ses
+    # Trim the "se." from the names
+    names(selabs) <- substr(selabs, 4, 100)
+    # Make first letter of selabs' names uppercase
+    substr(names(selabs), 1, 1) <- toupper(substr(names(selabs), 1, 1))
+    # For every row, the s.e. column that belongs to its measure (NA if none)
+    wse <- unname(selabs[as.character(mdf$variable)])
+    # Copy each s.e. column into `se` for the rows of its own measure.
+    # Done per column rather than per row; also safe when `mdf` has no rows.
+    for( se_col in ses ){
+      rows <- which(wse == se_col)
+      mdf$se[rows] <- mdf[rows, se_col]
+    }
+    # prune the redundant columns
+    mdf <- mdf[, !colnames(mdf) %in% selabs, drop=FALSE]
+  }
+  ## Interpret measures
+  # If not provided (default), keep all
+  if( !is.null(measures) ){
+    if( any(measures %in% as.character(mdf$variable)) ){
+      # If any measures were in mdf, then subset to just those.
+      mdf <- mdf[as.character(mdf$variable) %in% measures, ]
+    } else {
+      # Else, print warning about bad option choice for measures, keeping all.
+      warning("Argument to `measures` not supported. All alpha-diversity measures (should be) included in plot.")
+    }
+  }
+  if( !is.null(shsi) ){
+    # Deprecated:
+    # If shsi is anything but NULL, print a warning about its being deprecated
+    warning("shsi no longer supported option in plot_richness. Please use `measures` instead")
+  }
+  # Address `sortby` argument
+  if(!is.null(sortby)){
+    if(!all(sortby %in% levels(mdf$variable))){
+      warning("`sortby` argument not among `measures`. Ignored.")
+    }
+    if(!is.discrete(mdf[, x])){
+      warning("`sortby` argument provided, but `x` not a discrete variable. `sortby` is ignored.")
+    }
+    if(all(sortby %in% levels(mdf$variable)) && is.discrete(mdf[, x])){
+      # Replace x-factor with same factor that has levels re-ordered according to `sortby`:
+      # levels are sorted by the mean value of the `sortby` measure(s) within each level.
+      wh.sortby = which(mdf$variable %in% sortby)
+      mdf[, x] <- factor(mdf[, x],
+                         levels = names(sort(tapply(X = mdf[wh.sortby, "value"],
+                                                    INDEX = mdf[wh.sortby, x],
+                                                    mean,
+                                                    na.rm=TRUE, simplify = TRUE))))
+    }
+  }
+  # Define variable mapping
+  richness_map = aes_string(x=x, y="value", colour=color, shape=shape)
+  # Make the ggplot.
+  p = ggplot(mdf, richness_map) + geom_point(na.rm=TRUE)
+  # Add error bars if mdf$se is not all NA
+  if( any(!is.na(mdf[, "se"])) ){
+    p = p + geom_errorbar(aes(ymax=value + se, ymin=value - se), width=0.1)
+  }
+  # Rotate horizontal axis labels, and adjust
+  p = p + theme(axis.text.x=element_text(angle=-90, vjust=0.5, hjust=0))
+  # Add y-label
+  p = p + ylab('Alpha Diversity Measure')
+  # Facet wrap using user-options
+  p = p + facet_wrap(~variable, nrow=nrow, scales=scales)
+  # Optionally add a title to the plot
+  if( !is.null(title) ){
+    p <- p + ggtitle(title)
+  }
+  return(p)
+}
+################################################################################
+# The general case, could plot samples, taxa, or both (biplot/split). Default samples.
+################################################################################
+#' General ordination plotter based on ggplot2.
+#'
+#' There are many useful examples of phyloseq ordination graphics in the
+#' \href{http://joey711.github.io/phyloseq/plot_ordination-examples}{phyloseq online tutorials}.
+#' Convenience wrapper for plotting ordination results as a
+#' \code{ggplot2}-graphic, including
+#' additional annotation in the form of shading, shape, and/or labels of
+#' sample variables.
+#'
+#' @param physeq (Required). \code{\link{phyloseq-class}}.
+#' The data about which you want to
+#' plot and annotate the ordination.
+#'
+#' @param ordination (Required). An ordination object. Many different classes
+#' of ordination are defined by \code{R} packages. Ordination classes
+#' currently supported/created by the \code{\link{ordinate}} function are
+#' supported here. There is no default, as the expectation is that the
+#' ordination will be performed and saved prior to calling this plot function.
+#'
+#' @param type (Optional). The plot type. Default is \code{"samples"}. The
+#' currently supported options are
+#' \code{c("samples", "sites", "species", "taxa", "biplot", "split", "scree")}.
+#' The option
+#' ``taxa'' is equivalent to ``species'' in this case, and similarly,
+#' ``samples'' is equivalent to ``sites''.
+#' The options
+#' \code{"sites"} and \code{"species"} result in a single-plot of just the
+#' sites/samples or species/taxa of the ordination, respectively.
+#' The \code{"biplot"} and \code{"split"} options result in a combined
+#' plot with both taxa and samples, either combined into one plot (``biplot'')
+#' or
+#' separated in two facet panels (``split''), respectively.
+#' The \code{"scree"} option results in a call to \code{\link{plot_scree}},
+#' which produces an ordered bar plot of the normalized eigenvalues
+#' associated with each ordination axis.
+#'
+#' @param axes (Optional). A 2-element vector indicating the axes of the
+#' ordination that should be used for plotting.
+#' Can be \code{\link{character-class}} or \code{\link{integer-class}},
+#' naming the index name or index of the desired axis for the horizontal
+#' and vertical axes, respectively, in that order. The default value,
+#' \code{c(1, 2)}, specifies the first two axes of the provided ordination.
+#'
+#' @param color (Optional). Default \code{NULL}. Character string.
+#' The name of the variable to map to
+#' colors in the plot.
+#' This can be a sample variable
+#' (among the set returned by \code{sample_variables(physeq)} )
+#' or
+#' taxonomic rank
+#' (among the set returned by \code{rank_names(physeq)}).
+#'
+#' Note that the color scheme is chosen automatically
+#' by \code{\link{ggplot}},
+#' but it can be modified afterward with an additional layer using
+#' \code{\link[ggplot2]{scale_color_manual}}.
+#'
+#' @param shape (Optional). Default \code{NULL}. Character string.
+#' The name of the variable to map
+#' to different shapes on the plot.
+#' Similar to \code{color} option, but for the shape of points.
+#'
+#' The shape scale is chosen automatically by \code{\link{ggplot}},
+#' but it can be modified afterward with an additional layer using
+#' \code{\link[ggplot2]{scale_shape_manual}}.
+#'
+#' @param label (Optional). Default \code{NULL}. Character string.
+#' The name of the variable to map to text labels on the plot.
+#' Similar to \code{color} option, but for plotting text.
+#'
+#' @param title (Optional). Default \code{NULL}. Character string.
+#' The main title for the graphic.
+#'
+#' @param justDF (Optional). Default \code{FALSE}. Logical.
+#' Instead of returning a ggplot2-object, do you just want the relevant
+#' \code{data.frame} that was used to build the plot? This is a
+#' user-accessible option for obtaining the \code{data.frame},
+#' in principle to make a custom plot that isn't possible with the
+#' available options in this function. For contributing new functions
+#' (developers), the
+#' \code{\link{phyloseq-package}} provides/uses an internal function
+#' to build the key features of the \code{data.frame} prior to plot-build.
+#'
+#' @return A \code{\link{ggplot}} plot object, graphically summarizing
+#' the ordination result for the specified axes.
+#'
+#' @seealso
+#' Many more examples are included in the
+#' \href{http://joey711.github.io/phyloseq/plot_ordination-examples}{phyloseq online tutorials}.
+#'
+#' Also see the general wrapping function:
+#'
+#' \code{\link{plot_phyloseq}}
+#'
+#'
+#' @importFrom vegan wascores
+#'
+#' @importFrom ggplot2 facet_wrap
+#' @importFrom ggplot2 update_labels
+#' @importFrom ggplot2 scale_size_manual
+#' @importFrom ggplot2 xlab
+#' @importFrom ggplot2 ylab
+#'
+#' @export
+#'
+#' @examples
+#' # See other examples at
+#' # http://joey711.github.io/phyloseq/plot_ordination-examples
+#' data(GlobalPatterns)
+#' GP = prune_taxa(names(sort(taxa_sums(GlobalPatterns), TRUE)[1:50]), GlobalPatterns)
+#' gp_bray_pcoa = ordinate(GP, "CCA", "bray")
+#' plot_ordination(GP, gp_bray_pcoa, "samples", color="SampleType")
+plot_ordination = function(physeq, ordination, type="samples", axes=1:2,
+                           color=NULL, shape=NULL, label=NULL, title=NULL, justDF=FALSE){
+  # Plot the site/sample and/or species/taxa coordinates of `ordination`
+  # as a ggplot2 scatter plot, joined with the sample / taxonomy covariates
+  # found in `physeq` so that they can be mapped to color, shape and label.
+  #
+  # Returns:
+  #   * the vector of supported types when called as plot_ordination("list"),
+  #   * the result of plot_scree() when `type` resolves to "scree",
+  #   * the plot data.frame when `justDF` is TRUE,
+  #   * NULL (with a warning) when no coordinates could be obtained,
+  #   * otherwise the ggplot object.
+  #
+  # --- Argument sanitizing: each of these must be a single value ---
+  if(length(type) > 1){
+    warning("`type` can only be a single option, ",
+            "but more than one provided. Using only the first.")
+    type <- type[[1]]
+  }
+  if(length(color) > 1){
+    warning("The `color` variable argument should have length equal to 1. ",
+            "Taking first value.")
+    color = color[[1]][1]
+  }
+  if(length(shape) > 1){
+    warning("The `shape` variable argument should have length equal to 1. ",
+            "Taking first value.")
+    shape = shape[[1]][1]
+  }
+  if(length(label) > 1){
+    warning("The `label` variable argument should have length equal to 1. ",
+            "Taking first value.")
+    label = label[[1]][1]
+  }
+  official_types = c("sites", "species", "biplot", "split", "scree")
+  if(!inherits(physeq, "phyloseq")){
+    if(inherits(physeq, "character")){
+      # `plot_ordination("list")` is a query for the supported types.
+      # isTRUE() keeps this a scalar test even for odd character input.
+      if(isTRUE(physeq[1] == "list")){
+        return(official_types)
+      }
+    }
+    warning("Full functionality requires `physeq` be phyloseq-class ",
+            "with multiple components.")
+  }
+  # Catch typos and synonyms
+  type = gsub("^.*site[s]*.*$", "sites", type, ignore.case=TRUE)
+  type = gsub("^.*sample[s]*.*$", "sites", type, ignore.case=TRUE)
+  type = gsub("^.*species.*$", "species", type, ignore.case=TRUE)
+  type = gsub("^.*taxa.*$", "species", type, ignore.case=TRUE)
+  type = gsub("^.*OTU[s]*.*$", "species", type, ignore.case=TRUE)
+  type = gsub("^.*biplot[s]*.*$", "biplot", type, ignore.case=TRUE)
+  type = gsub("^.*split[s]*.*$", "split", type, ignore.case=TRUE)
+  type = gsub("^.*scree[s]*.*$", "scree", type, ignore.case=TRUE)
+  # If type argument is not supported...
+  if( !type %in% official_types ){
+    warning("type argument not supported. `type` set to 'samples'.\n",
+            "See `plot_ordination('list')`")
+    type <- "sites"
+  }
+  if( type %in% c("scree") ){
+    # Stop early by passing to plot_scree() if "scree" was chosen as a type
+    return( plot_scree(ordination, title=title) )
+  }
+  # Local helper: TRUE for NULL, for anything of length < 2, or for all-NA input
+  is_empty = function(x){
+    length(x) < 2 || suppressWarnings(all(is.na(x)))
+  }
+  # --- The plotting data frames ---
+  # Call scores to get coordinates.
+  # Each call is wrapped in try() so that a coordinate system that is not
+  # available simply leaves the corresponding object NULL.
+  specDF = siteDF = NULL
+  trash1 = try({siteDF <- scores(ordination, choices = axes,
+                                 display="sites", physeq=physeq)},
+               silent = TRUE)
+  trash2 = try({specDF <- scores(ordination, choices = axes,
+                                 display="species", physeq=physeq)},
+               silent = TRUE)
+  # Check that have assigned coordinates to the correct object,
+  # by counting how many row names match sample names vs. taxa names.
+  siteSampIntx = length(intersect(rownames(siteDF), sample_names(physeq)))
+  siteTaxaIntx = length(intersect(rownames(siteDF), taxa_names(physeq)))
+  specSampIntx = length(intersect(rownames(specDF), sample_names(physeq)))
+  specTaxaIntx = length(intersect(rownames(specDF), taxa_names(physeq)))
+  if(siteSampIntx < specSampIntx && specTaxaIntx < siteTaxaIntx){
+    # Double-swap
+    co = specDF
+    specDF <- siteDF
+    siteDF <- co
+    rm(co)
+  } else {
+    if(siteSampIntx < specSampIntx){
+      # Single swap
+      siteDF <- specDF
+      specDF <- NULL
+    }
+    if(specTaxaIntx < siteTaxaIntx){
+      # Single swap
+      specDF <- siteDF
+      siteDF <- NULL
+    }
+  }
+  # If both empty, warn and return NULL
+  if(is_empty(siteDF) && is_empty(specDF)){
+    warning("Could not obtain coordinates from the provided `ordination`. \n",
+            "Please check your ordination method, and whether it is supported by `scores` or listed by phyloseq-package.")
+    return(NULL)
+  }
+  # If either is missing, do weighted average
+  if(is_empty(specDF) && type != "sites"){
+    message("Species coordinates not found directly in ordination object. Attempting weighted average (`vegan::wascores`)")
+    specDF <- data.frame(wascores(siteDF, w = veganifyOTU(physeq)), stringsAsFactors=FALSE)
+  }
+  if(is_empty(siteDF) && type != "species"){
+    # This branch estimates the *site* coordinates, so say so in the message.
+    message("Site/sample coordinates not found directly in ordination object. Attempting weighted average (`vegan::wascores`)")
+    siteDF <- data.frame(wascores(specDF, w = t(veganifyOTU(physeq))), stringsAsFactors=FALSE)
+  }
+  # Double-check that have assigned coordinates to the correct object
+  specTaxaIntx <- siteSampIntx <- NULL
+  siteSampIntx <- length(intersect(rownames(siteDF), sample_names(physeq)))
+  specTaxaIntx <- length(intersect(rownames(specDF), taxa_names(physeq)))
+  if(siteSampIntx < 1L && !is_empty(siteDF)){
+    # If siteDF is not empty, but it doesn't intersect the sample_names in physeq, warn and set to NULL
+    warning("`Ordination site/sample coordinate indices did not match `physeq` index names. Setting corresponding coordinates to NULL.")
+    siteDF <- NULL
+  }
+  if(specTaxaIntx < 1L && !is_empty(specDF)){
+    # If specDF is not empty, but it doesn't intersect the taxa_names in physeq, warn and set to NULL
+    warning("`Ordination species/OTU/taxa coordinate indices did not match `physeq` index names. Setting corresponding coordinates to NULL.")
+    specDF <- NULL
+  }
+  # If you made it this far and both NULL, return NULL and throw a warning
+  if(is_empty(siteDF) && is_empty(specDF)){
+    warning("Could not obtain coordinates from the provided `ordination`. \n",
+            "Please check your ordination method, and whether it is supported by `scores` or listed by phyloseq-package.")
+    return(NULL)
+  }
+  if(type %in% c("biplot", "split") && (is_empty(siteDF) || is_empty(specDF)) ){
+    # biplot and split require both coordinates systems available.
+    # Both were attempted, or even evaluated by weighted average.
+    # If still empty, warn and switch to relevant type.
+    if(is_empty(siteDF)){
+      warning("Could not access/evaluate site/sample coordinates. Switching type to 'species'")
+      type <- "species"
+    }
+    if(is_empty(specDF)){
+      warning("Could not access/evaluate species/taxa/OTU coordinates. Switching type to 'sites'")
+      type <- "sites"
+    }
+  }
+  if(type != "species"){
+    # samples covariate data frame, `sdf`
+    sdf = NULL
+    sdf = data.frame(access(physeq, slot="sam_data"), stringsAsFactors=FALSE)
+    if( !is_empty(sdf) && !is_empty(siteDF) ){
+      # The first two axes should always be x and y, the ordination axes.
+      # drop=FALSE: a single-variable `sdf` must stay a data.frame,
+      # otherwise the covariate loses its column name in the cbind.
+      siteDF <- cbind(siteDF, sdf[rownames(siteDF), , drop=FALSE])
+    }
+  }
+  if(type != "sites"){
+    # taxonomy data frame `tdf`
+    tdf = NULL
+    tdf = data.frame(access(physeq, slot="tax_table"), stringsAsFactors=FALSE)
+    if( !is_empty(tdf) && !is_empty(specDF) ){
+      # The first two axes should always be x and y, the ordination axes.
+      # drop=FALSE for the same reason as for `sdf` above.
+      specDF = cbind(specDF, tdf[rownames(specDF), , drop=FALSE])
+    }
+  }
+  # In "naked" OTU-table cases, `siteDF` or `specDF` could be matrix.
+  if(!inherits(siteDF, "data.frame")){
+    siteDF <- as.data.frame(siteDF, stringsAsFactors = FALSE)
+  }
+  if(!inherits(specDF, "data.frame")){
+    specDF <- as.data.frame(specDF, stringsAsFactors = FALSE)
+  }
+  # Define the main plot data frame, `DF`
+  DF = NULL
+  DF <- switch(EXPR = type, sites = siteDF, species = specDF, {
+    # Anything else. In practice, type should be "biplot" or "split" here.
+    # Add id.type label
+    specDF$id.type <- "Taxa"
+    siteDF$id.type <- "Samples"
+    # But what if the axis variables differ b/w them?
+    # Coerce specDF to match samples (siteDF) axis names
+    colnames(specDF)[1:2] <- colnames(siteDF)[1:2]
+    # Merge the two data frames together for joint plotting.
+    DF = merge(specDF, siteDF, all=TRUE)
+    # Replace NA with "samples" or "taxa", where appropriate (factor/character)
+    if(!is.null(shape)){ DF <- rp.joint.fill(DF, shape, "Samples") }
+    if(!is.null(shape)){ DF <- rp.joint.fill(DF, shape, "Taxa") }
+    if(!is.null(color)){ DF <- rp.joint.fill(DF, color, "Samples") }
+    if(!is.null(color)){ DF <- rp.joint.fill(DF, color, "Taxa") }
+    DF
+  })
+  # In case user wants the plot-DF for some other purpose, return early
+  if(justDF){return(DF)}
+  # Check variable availability before defining mapping.
+  if(!is.null(color)){
+    if(!color %in% names(DF)){
+      warning("Color variable was not found in the available data you provided. ",
+              "No color mapped.")
+      color <- NULL
+    }
+  }
+  if(!is.null(shape)){
+    if(!shape %in% names(DF)){
+      warning("Shape variable was not found in the available data you provided. ",
+              "No shape mapped.")
+      shape <- NULL
+    }
+  }
+  if(!is.null(label)){
+    if(!label %in% names(DF)){
+      warning("Label variable was not found in the available data you provided. ",
+              "No label mapped.")
+      label <- NULL
+    }
+  }
+  # Grab the ordination axis names from the plot data frame (as strings)
+  x = colnames(DF)[1]
+  y = colnames(DF)[2]
+  # Mapping section
+  if( ncol(DF) <= 2){
+    # If there is nothing to map, enforce simple mapping.
+    message("No available covariate data to map on the points for this plot `type`")
+    ord_map = aes_string(x=x, y=y)
+  } else if( type %in% c("sites", "species", "split") ){
+    ord_map = aes_string(x=x, y=y, color=color, shape=shape, na.rm=TRUE)
+  } else if(type=="biplot"){
+    # biplot, `id.type` should try to map to color and size. Only size if color specified.
+    if( is.null(color) ){
+      ord_map = aes_string(x=x, y=y, size="id.type", color="id.type", shape=shape, na.rm=TRUE)
+    } else {
+      ord_map = aes_string(x=x, y=y, size="id.type", color=color, shape=shape, na.rm=TRUE)
+    }
+  }
+  # Plot-building section
+  p <- ggplot(DF, ord_map) + geom_point(na.rm=TRUE)
+  # split/facet color and shape can be anything in one or other.
+  if( type=="split" ){
+    # split-option requires a facet_wrap
+    p <- p + facet_wrap(~id.type, nrow=1)
+  }
+  # If biplot, adjust scales
+  if( type=="biplot" ){
+    if( is.null(color) ){
+      # Rename color title in legend.
+      p <- update_labels(p, list(colour="Ordination Type"))
+    }
+    # Adjust size so that samples are bigger than taxa by default.
+    p <- p + scale_size_manual("type", values=c(Samples=5, Taxa=2))
+  }
+  # Add text labels to points
+  if( !is.null(label) ){
+    label_map <- aes_string(x=x, y=y, label=label, na.rm=TRUE)
+    p = p + geom_text(label_map, data=rm.na.phyloseq(DF, label),
+                      size=2, vjust=1.5, na.rm=TRUE)
+  }
+  # Optionally add a title to the plot
+  if( !is.null(title) ){
+    p = p + ggtitle(title)
+  }
+  # Add fraction variability to axis labels, if available.
+  # Eigenvalues are extracted once; NULL (no eigenvalues) indexes to length 0.
+  eigvec = extract_eigenvalue(ordination)
+  if( length(eigvec[axes]) > 0 ){
+    # Fraction variability, fracvar
+    fracvar = eigvec[axes] / sum(eigvec)
+    # Percent variability, percvar
+    percvar = round(100*fracvar, 1)
+    # The string to add to each axis label, strivar
+    # Start with the current axis labels in the plot
+    strivar = as(c(p$label$x, p$label$y), "character")
+    # paste the percent variability string at the end
+    strivar = paste0(strivar, " [", percvar, "%]")
+    # Update the x-label and y-label
+    p = p + xlab(strivar[1]) + ylab(strivar[2])
+  }
+  # Return the ggplot object
+  return(p)
+}
+################################################################################
+# Remove NA elements from data.frame prior to plotting
+# Remove NA level from factor
+################################################################################
+#' @keywords internal
+rm.na.phyloseq <- function(DF, key.var){
+  # Remove the rows of data.frame `DF` for which column `key.var` is NA,
+  # then rebuild that column's factor levels so that levels no longer
+  # present are not carried along.
+  #
+  # DF:      a data.frame.
+  # key.var: a single character string naming a column of `DF`.
+  # Returns `DF` restricted to the rows with a non-missing `key.var`.
+  if( !key.var %in% colnames(DF) ){
+    stop("`key.var` is not a column of `DF`: ", key.var)
+  }
+  # (1) Remove elements from DF if key.var has NA.
+  # Index the column by name rather than eval(parse(text=...)), which
+  # breaks on column names that are not syntactically valid R symbols.
+  DF <- DF[!is.na(DF[[key.var]]), , drop=FALSE]
+  # (2) Remove NA from the factor level, if a (plain) factor.
+  # `class(x) == "factor"` is a length-2 condition for ordered factors,
+  # so test with is.factor()/is.ordered(); ordered factors are left as-is.
+  if( is.factor(DF[[key.var]]) && !is.ordered(DF[[key.var]]) ){
+    DF[[key.var]] <- factor(as.character(DF[[key.var]]))
+  }
+  return(DF)
+}
+################################################################################
+#' @keywords internal
+#' @importFrom plyr is.discrete
+rp.joint.fill <- function(DF, map.var, id.type.rp="samples"){
+  # In a joint samples/taxa data.frame, fill the NA entries of column
+  # `map.var` with the label `id.type.rp`, but only when every row whose
+  # `id.type` equals `id.type.rp` is NA for that column and the column
+  # is discrete. Otherwise `DF` is returned untouched.
+  group.vals <- DF[DF$id.type==id.type.rp, map.var]
+  if( !all(is.na(group.vals)) ){
+    return(DF)
+  }
+  if( !is.discrete(DF[, map.var]) ){
+    return(DF)
+  }
+  # Coerce to character, substitute the label for NA, and convert back
+  # to a factor whose first (reference) level is the label.
+  filled <- as(DF[, map.var], "character")
+  filled[is.na(filled)] <- id.type.rp
+  DF[, map.var] <- relevel(factor(filled), id.type.rp)
+  return(DF)
+}
+################################################################################
+#' Subset points from an ordination-derived ggplot
+#'
+#' Easily retrieve a plot-derived \code{data.frame} with a subset of points
+#' according to a threshold and method. The meaning of the threshold depends
+#' upon the method. See argument description below.
+#' There are many useful examples of phyloseq ordination graphics in the
+#' \href{http://joey711.github.io/phyloseq/subset_ord_plot-examples}{phyloseq online tutorials}.
+#'
+#' @usage subset_ord_plot(p, threshold=0.05, method="farthest")
+#'
+#' @param p (Required). A \code{\link{ggplot}} object created by
+#' \code{\link{plot_ordination}}. It contains the complete data that you
+#' want to subset.
+#'
+#' @param threshold (Optional). A numeric scalar. Default is \code{0.05}.
+#' This value determines a coordinate threshold or population threshold,
+#' depending on the value of the \code{method} argument, ultimately
+#' determining which points are included in returned \code{data.frame}.
+#'
+#' @param method (Optional). A character string. One of
+#' \code{c("farthest", "radial", "square")}. Default is \code{"farthest"}.
+#' This determines how threshold will be interpreted.
+#'
+#' \describe{
+#'
+#' \item{farthest}{
+#' Unlike the other two options, this option implies removing a
+#' certain fraction or number of points from the plot, depending
+#' on the value of \code{threshold}. If \code{threshold} is greater
+#' than or equal to \code{1}, then all but \code{threshold} number
+#' of points farthest from the origin are removed. Otherwise, if
+#' \code{threshold} is less than \code{1}, all but the \code{threshold}
+#' fraction of points farthest from the origin are removed.
+#' }
+#'
+#' \item{radial}{
+#' Keep only those points that are beyond \code{threshold}
+#' radial distance from the origin. Has the effect of removing a
+#' circle of points from the plot, centered at the origin.
+#' }
+#'
+#' \item{square}{
+#' Keep only those points with at least one coordinate
+#' greater than \code{threshold}. Has the effect of removing a
+#' ``square'' of points from the plot, centered at the origin.
+#' }
+#'
+#' }
+#'
+#' @return A \code{\link{data.frame}} suitable for creating a
+#' \code{\link{ggplot}} plot object, graphically summarizing
+#' the ordination result according to previously-specified parameters.
+#'
+#' @seealso
+#' \href{http://joey711.github.io/phyloseq/subset_ord_plot-examples}{phyloseq online tutorial} for this function.
+#'
+#' \code{\link{plot_ordination}}
+#'
+#'
+#' @export
+#' @examples
+#' ## See the online tutorials.
+#' ## http://joey711.github.io/phyloseq/subset_ord_plot-examples
+subset_ord_plot <- function(p, threshold=0.05, method="farthest"){
+  # Return the rows of `p$data` selected by `method`/`threshold`, using the
+  # first two columns of `p$data` as the plotted coordinates.
+  #
+  # p:         object with a `data` element (data.frame, coordinates first).
+  # threshold: numeric scalar; only the first value is used.
+  # method:    "farthest", "radial" or "square" (may be abbreviated);
+  #            only the first value is used.
+  # Errors if `method` is not supported, or if `threshold` is negative/NA
+  # for the "farthest" method.
+  threshold <- threshold[1] # ignore all but first threshold value.
+  method.names <- c("farthest", "radial", "square")
+  # Resolve the (possibly abbreviated) method once. pmatch() yields NA for
+  # an unsupported name, which must be caught before any `==` comparison.
+  method <- method.names[pmatch(method[1], method.names)]
+  if( length(method) != 1 || is.na(method) ){
+    stop("method name not supported. Please select a valid method")
+  }
+  # Distance of each point from the origin, named by row
+  df <- p$data[, c(1, 2)]
+  d <- sqrt(df[, 1]^2 + df[, 2]^2)
+  names(d) <- rownames(df)
+  if( method == "farthest" ){
+    if( !is.numeric(threshold) || is.na(threshold) || threshold < 0 ){
+      stop("threshold not a valid positive numeric scalar")
+    }
+    # Point names ordered from farthest to nearest
+    ranked <- names(sort(d, decreasing=TRUE))
+    # threshold >= 1 is a count of points, otherwise a fraction of them
+    n.keep <- if( threshold >= 1 ){
+      floor(threshold)
+    } else {
+      round(threshold * length(d))
+    }
+    # seq_len() (not 1:n) so that zero points means zero points, and cap
+    # at the number available so no NA rows are requested.
+    show.names <- ranked[seq_len(min(n.keep, length(ranked)))]
+  } else if( method == "radial" ){
+    show.names <- names(d[d > threshold])
+  } else {
+    # "square": at least one coordinate beyond the threshold
+    show.names <- rownames(df)[((abs(df[, 1]) > threshold) | (abs(df[, 2]) > threshold))]
+  }
+
+  return(p$data[show.names, , drop=FALSE])
+}
+################################################################################
+#' General ordination eigenvalue plotter using ggplot2.
+#'
+#' Convenience wrapper for plotting ordination eigenvalues (if available)
+#' using a \code{ggplot2}-graphic.
+#'
+#' @param ordination (Required). An ordination object. Many different classes
+#' of ordination are defined by \code{R} packages. Ordination classes
+#' currently supported/created by the \code{\link{ordinate}} function are
+#' supported here.
+#' There is no default, as the expectation is that the
+#' ordination will be performed and saved prior to calling this plot function.
+#'
+#' @param title (Optional). Default \code{NULL}. Character string.
+#' The main title for the graphic.
+#'
+#' @return A \code{\link{ggplot}} plot object, graphically summarizing
+#' the ordination result for the specified axes.
+#'
+#' @seealso
+#'
+#' \code{\link{plot_ordination}}
+#'
+#' \code{\link{ordinate}}
+#'
+#' \code{\link{distance}}
+#'
+#' \href{http://joey711.github.io/phyloseq/plot_ordination-examples}{phyloseq online tutorials}
+#'
+#' @importFrom ggplot2 geom_bar
+#' @importFrom ggplot2 scale_x_discrete
+#' @importFrom ggplot2 element_text
+#'
+#' @export
+#' @examples
+#' # First load and trim a dataset
+#' data("GlobalPatterns")
+#' GP = prune_taxa(names(sort(taxa_sums(GlobalPatterns), TRUE)[1:50]), GlobalPatterns)
+#' # Test plots (performs ordination in-line, then makes scree plot)
+#' plot_scree(ordinate(GP, "DPCoA", "bray"))
+#' plot_scree(ordinate(GP, "PCoA", "bray"))
+#' # Empty return with message
+#' plot_scree(ordinate(GP, "NMDS", "bray"))
+#' # Constrained ordinations
+#' plot_scree(ordinate(GP, "CCA", formula=~SampleType))
+#' plot_scree(ordinate(GP, "RDA", formula=~SampleType))
+#' plot_scree(ordinate(GP, "CAP", formula=~SampleType))
+#' # Deprecated example of constrained ordination (emits a warning)
+#' #plot_scree(ordinate(GP ~ SampleType, "RDA"))
+#' plot_scree(ordinate(GP, "DCA"))
+#' plot_ordination(GP, ordinate(GP, "DCA"), type="scree")
+plot_scree = function(ordination, title=NULL){
+  # Bar plot of the eigenvalues of `ordination`, each expressed as a
+  # fraction of their sum. Returns NULL (after printing a note) when
+  # extract_eigenvalue() finds none, otherwise the ggplot object.
+  eig = extract_eigenvalue(ordination)
+  # Guard clause: nothing to plot without eigenvalues
+  if( is.null(eig) ){
+    cat("No eigenvalues found in ordination\n")
+    return(NULL)
+  }
+  # Unnamed eigenvalues are labelled by position: 1, 2, ..., N
+  if( is.null(names(eig)) ){
+    names(eig) <- 1:length(eig)
+  }
+  # Scale to fractions of the total, then floor negative values at zero
+  eig <- eig/sum(eig)
+  eig[eig <= 0.0] <- 0.0
+  axis.order <- names(eig)
+  gdf = data.frame(axis=axis.order, eigenvalue = eig)
+  # Bars in the original eigenvalue order, with vertical x-axis labels
+  p = ggplot(gdf, aes(x=axis, y=eigenvalue)) +
+    geom_bar(stat="identity") +
+    scale_x_discrete(limits = axis.order) +
+    theme(axis.text.x=element_text(angle=90, vjust=0.5))
+  # Title only when one was supplied
+  if( !is.null(title) ){
+    p <- p + ggtitle(title)
+  }
+  return(p)
+}
+################################################################################
+# Define S3 generic extract_eigenvalue function; formerly S4 generic get_eigenvalue()
+# Function is used by `plot_scree` to get the eigenvalue vector from different
+# types of ordination objects.
+# Used S3 generic in this case because many ordination objects, the input, are
+# not formally-defined S4 classes, but vaguely-/un-defined S3. This throws
+# warnings during package build if extract_eigenvalue were S4 generic method,
+# because the ordination classes don't appear to have any definition in phyloseq
+# or dependencies.
+#' @keywords internal
+extract_eigenvalue = function(ordination){
+  # S3 generic: dispatch on the class of the ordination object
+  UseMethod("extract_eigenvalue", ordination)
+}
+# Fallback for any class without a dedicated method below: no eigenvalues.
+extract_eigenvalue.default = function(ordination){
+  NULL
+}
+# "pcoa" objects: the `Relative_eig` element of `values`
+extract_eigenvalue.pcoa = function(ordination){
+  ordination$values$Relative_eig
+}
+# "cca" objects: the `eig` of the CCA component followed by that of CA
+extract_eigenvalue.cca = function(ordination){
+  ccaEig = ordination$CCA$eig
+  caEig = ordination$CA$eig
+  c(ccaEig, caEig)
+}
+# "rda" objects: same layout as for "cca"
+extract_eigenvalue.rda = function(ordination){
+  ccaEig = ordination$CCA$eig
+  caEig = ordination$CA$eig
+  c(ccaEig, caEig)
+}
+# "dpcoa" objects: the `eig` element
+extract_eigenvalue.dpcoa = function(ordination){
+  ordination$eig
+}
+# "decorana" (dca) objects: the `evals` element
+extract_eigenvalue.decorana = function(ordination){
+  ordination$evals
+}
+################################################################################
+#' Melt phyloseq data object into large data.frame
+#'
+#' The psmelt function is a specialized melt function for melting phyloseq objects
+#' (instances of the phyloseq class), usually for producing graphics
+#' with \code{\link[ggplot2]{ggplot}2}. \code{psmelt} relies heavily on the
+#' \code{\link[reshape2]{melt}} and \code{\link{merge}} functions.
+#' The naming conventions used in downstream phyloseq graphics functions
+#' have reserved the following variable names that should not be used
+#' as the names of \code{\link{sample_variables}}
+#' or taxonomic \code{\link{rank_names}}.
+#' These reserved names are \code{c("Sample", "Abundance", "OTU")}.
+#' Also, you should not have identical names for
+#' sample variables and taxonomic ranks.
+#' That is, the intersection of the output of the following two functions
+#' \code{\link{sample_variables}}, \code{\link{rank_names}}
+#' should be an empty vector
+#' (e.g. \code{intersect(sample_variables(physeq), rank_names(physeq))}).
+#' All of these potential name collisions are checked-for
+#' and renamed automatically with a warning.
+#' However, if you (re)name your variables accordingly ahead of time,
+#' it will reduce confusion and eliminate the warnings.
+#'
+#' Note that
+#' ``melted'' phyloseq data is stored much less efficiently,
+#' and so RAM storage issues could arise with a smaller dataset
+#' (smaller number of samples/OTUs/variables) than one might otherwise expect.
+#' For common sizes of graphics-ready datasets, however,
+#' this should not be a problem.
+#' Because the number of OTU entries has a large effect on the RAM requirement,
+#' methods to reduce the number of separate OTU entries --
+#' for instance by agglomerating OTUs based on phylogenetic distance
+#' using \code{\link{tip_glom}} --
+#' can help alleviate RAM usage problems.
+#' This function is made user-accessible for flexibility,
+#' but is also used extensively by plot functions in phyloseq.
+#'
+#' @usage psmelt(physeq)
+#'
+#' @param physeq (Required). An \code{\link{otu_table-class}} or
+#' \code{\link{phyloseq-class}}. Function most useful for phyloseq-class.
+#'
+#' @return A \code{\link{data.frame}}-class table.
+#'
+#' @seealso
+#' \code{\link{plot_bar}}
+#'
+#' \code{\link[reshape2]{melt}}
+#'
+#' \code{\link{merge}}
+#'
+#' @import reshape2
+#'
+#' @export
+#'
+#' @examples
+#' data("GlobalPatterns")
+#' gp.ch = subset_taxa(GlobalPatterns, Phylum == "Chlamydiae")
+#' mdf = psmelt(gp.ch)
+#' nrow(mdf)
+#' ncol(mdf)
+#' colnames(mdf)
+#' head(rownames(mdf))
+#' # Create a ggplot similar to
+#' library("ggplot2")
+#' p = ggplot(mdf, aes(x=SampleType, y=Abundance, fill=Genus))
+#' p = p + geom_bar(color="black", stat="identity", position="stack")
+#' print(p)
+psmelt = function(physeq){
+  # Melt the OTU table of `physeq` into a long data.frame with columns
+  # OTU, Sample and Abundance, merge in the sample data and the taxonomy
+  # table when available, and sort the result by decreasing Abundance.
+  #
+  # Covariate names are only looked up for a phyloseq-class object;
+  # for any other input both stay NULL and nothing is merged.
+  rankNames = NULL
+  sampleVars = NULL
+  if(inherits(physeq, "phyloseq")){
+    # Still might be NULL, but attempt access
+    rankNames = rank_names(physeq, FALSE)
+    sampleVars = sample_variables(physeq, FALSE)
+  }
+  # Column names that the melted table itself needs
+  reserved = c("Sample", "Abundance", "OTU")
+  # Conflict 1a: sample variables that collide with a reserved name
+  samVsReserved = intersect(reserved, sampleVars)
+  if(length(samVsReserved) > 0){
+    idxSam = which(sampleVars %in% samVsReserved)
+    renamedSam = paste0("sample_", sampleVars[idxSam])
+    # Announce the renaming before doing it
+    warning("The sample variables: \n",
+            paste(sampleVars[idxSam], collapse=", "),
+            "\n have been renamed to: \n",
+            paste0(renamedSam, collapse=", "), "\n",
+            "to avoid conflicts with special phyloseq plot attribute names.")
+    colnames(sample_data(physeq))[idxSam] <- renamedSam
+  }
+  # Conflict 1b: taxonomic ranks that collide with a reserved name
+  rankVsReserved = intersect(reserved, rankNames)
+  if(length(rankVsReserved) > 0){
+    idxRank = which(rankNames %in% rankVsReserved)
+    renamedRank = paste0("taxa_", rankNames[idxRank])
+    # Announce the renaming before doing it
+    warning("The rank names: \n",
+            paste(rankNames[idxRank], collapse=", "),
+            "\n have been renamed to: \n",
+            paste0(renamedRank, collapse=", "), "\n",
+            "to avoid conflicts with special phyloseq plot attribute names.")
+    colnames(tax_table(physeq))[idxRank] <- renamedRank
+  }
+  # Conflict 2: sample variables that share a name with a taxonomic rank;
+  # the sample variable is the one that gets renamed.
+  samVsRank = intersect(sampleVars, rankNames)
+  if(length(samVsRank) > 0){
+    idxBoth = which(sampleVars %in% samVsRank)
+    renamedBoth = paste0("sample_", sampleVars[idxBoth])
+    # Announce the renaming before doing it
+    warning("The sample variables: \n",
+            paste0(sampleVars[idxBoth], collapse=", "),
+            "\n have been renamed to: \n",
+            paste0(renamedBoth, collapse=", "), "\n",
+            "to avoid conflicts with taxonomic rank names.")
+    colnames(sample_data(physeq))[idxBoth] <- renamedBoth
+  }
+  # Work with taxa as rows. Going through otu_table() also
+  # supports a "naked" otu_table as `physeq` input.
+  otutab = otu_table(physeq)
+  if(!taxa_are_rows(otutab)){
+    otutab <- t(otutab)
+  }
+  # Wide matrix to long table: one row per (OTU, Sample) pair
+  mdf = reshape2::melt(as(otutab, "matrix"))
+  colnames(mdf)[1:3] <- c("OTU", "Sample", "Abundance")
+  # melt may turn the row/column names into integer or factor.
+  # Both identifiers must stay discrete, so force them to character.
+  mdf$OTU <- as.character(mdf$OTU)
+  mdf$Sample <- as.character(mdf$Sample)
+  # Attach the sample data, when there is any
+  if(!is.null(sampleVars)){
+    sdf = data.frame(sample_data(physeq), stringsAsFactors=FALSE)
+    sdf$Sample <- sample_names(physeq)
+    mdf <- merge(mdf, sdf, by.x="Sample")
+  }
+  # Attach the taxonomy, when there is any
+  if(!is.null(rankNames)){
+    TT = access(physeq, "tax_table")
+    # A rank column is kept only if at least one taxon has a non-NA entry
+    keepTTcols <- colSums(is.na(TT)) < ntaxa(TT)
+    hasKeptCol <- length(which(keepTTcols)) > 0
+    hasAnyCol <- ncol(TT) > 0
+    # Protect against all-empty columns, or col-less matrix
+    if(hasKeptCol & hasAnyCol){
+      TT <- TT[, keepTTcols]
+      tdf = data.frame(TT, OTU=taxa_names(physeq))
+      mdf <- merge(mdf, tdf, by.x="OTU")
+    }
+  }
+  # Most abundant entries first
+  mdf = mdf[order(mdf$Abundance, decreasing=TRUE), ]
+  return(mdf)
+}
+################################################################################
+#' A flexible, informative barplot of phyloseq data
+#'
+#' There are many useful examples of phyloseq barplot graphics in the
+#' \href{http://joey711.github.io/phyloseq/plot_bar-examples}{phyloseq online tutorials}.
+#' This function wraps \code{ggplot2} plotting, and returns a \code{ggplot2}
+#' graphic object
+#' that can be saved or further modified with additional layers, options, etc.
+#' The main purpose of this function is to quickly and easily create informative
+#' summary graphics of the differences in taxa abundance between samples in
+#' an experiment.
+#'
+#' @usage plot_bar(physeq, x="Sample", y="Abundance", fill=NULL,
+#' title=NULL, facet_grid=NULL)
+#'
+#' @param physeq (Required). An \code{\link{otu_table-class}} or
+#' \code{\link{phyloseq-class}}.
+#'
+#' @param x (Optional). Optional, but recommended, especially if your data
+#' is comprised of many samples. A character string.
+#' The variable in the melted-data that should be mapped to the x-axis.
+#' See \code{\link{psmelt}}, \code{\link{melt}},
+#' and \code{\link{ggplot}} for more details.
+#'
+#' @param y (Optional). A character string.
+#' The variable in the melted-data that should be mapped to the y-axis.
+#' Typically this will be \code{"Abundance"}, in order to
+#' quantitatively display the abundance values for each OTU/group.
+#' However, alternative variables could be used instead,
+#' producing a very different, though possibly still informative, plot.
+#' See \code{\link{psmelt}}, \code{\link{melt}},
+#' and \code{\link{ggplot}} for more details.
+#'
+#' @param fill (Optional). A character string. Indicates which sample variable
+#' should be used to map to the fill color of the bars.
+#' The default is \code{NULL}, resulting in a gray fill for all bar segments.
+#'
+#' @param facet_grid (Optional). A formula object.
+#' It should describe the faceting you want in exactly the same way as for
+#' \code{\link[ggplot2]{facet_grid}},
+#' and is ultimately provided to \code{\link{ggplot}}2 graphics.
+#' The default is: \code{NULL}, resulting in no faceting.
+#'
+#' @param title (Optional). Default \code{NULL}. Character string.
+#' The main title for the graphic.
+#'
+#' @return A \code{\link[ggplot2]{ggplot}}2 graphic object -- rendered in the graphical device
+#' as the default \code{\link[base]{print}}/\code{\link[methods]{show}} method.
+#'
+#' @seealso
+#' \href{http://joey711.github.io/phyloseq/plot_bar-examples}{phyloseq online tutorials}.
+#'
+#' \code{\link{psmelt}}
+#'
+#' \code{\link{ggplot}}
+#'
+#' \code{\link{qplot}}
+#'
+#'
+#' @importFrom ggplot2 aes_string
+#' @importFrom ggplot2 geom_bar
+#' @importFrom ggplot2 facet_grid
+#' @importFrom ggplot2 element_text
+#'
+#' @export
+#'
+#' @examples
+#' data("GlobalPatterns")
+#' gp.ch = subset_taxa(GlobalPatterns, Phylum == "Chlamydiae")
+#' plot_bar(gp.ch)
+#' plot_bar(gp.ch, fill="Genus")
+#' plot_bar(gp.ch, x="SampleType", fill="Genus")
+#' plot_bar(gp.ch, "SampleType", fill="Genus", facet_grid=~Family)
+#' # See additional examples in the plot_bar online tutorial. Link above.
+plot_bar = function(physeq, x="Sample", y="Abundance", fill=NULL,
+                    title=NULL, facet_grid=NULL){
+
+  # Long-format ("melted") table of the input, as produced by psmelt().
+  melted <- psmelt(physeq)
+
+  # Aesthetic mapping is built from the character arguments.
+  # A NULL `fill` simply leaves that aesthetic unmapped.
+  bar_mapping <- aes_string(x=x, y=y, fill=fill)
+
+  # Base graphic: stacked bars drawn from the values as given
+  # (stat="identity"), each segment outlined in black, with the
+  # x-axis labels rotated because they might be long.
+  p <- ggplot(melted, bar_mapping) +
+    geom_bar(stat="identity", position="stack", color="black") +
+    theme(axis.text.x=element_text(angle=-90, hjust=0))
+
+  # Faceting is optional; the formula is handed to facet_grid() untouched.
+  if( !is.null(facet_grid) ){
+    p <- p + facet_grid(facet_grid)
+  }
+
+  # Main title is optional as well.
+  if( !is.null(title) ){
+    p <- p + ggtitle(title)
+  }
+
+  return(p)
+}
+################################################################################
+# plot_tree section.
+################################################################################
+#' Returns a data table defining the line segments of a phylogenetic tree.
+#'
+#' This function takes a \code{\link{phylo}} or \code{\link{phyloseq-class}} object
+#' and returns a list of two \code{\link{data.table}}s suitable for plotting
+#' a phylogenetic tree with \code{\link[ggplot2]{ggplot}}2.
+#'
+#' @param phy (Required). The \code{\link{phylo}} or \code{\link{phyloseq-class}}
+#' object (which must contain a \code{\link{phylo}}genetic tree)
+#' that you want converted to \code{\link{data.table}}s
+#' suitable for plotting with \code{\link[ggplot2]{ggplot}}2.
+#'
+#' @param ladderize (Optional). Boolean or character string (either
+#' \code{FALSE}, \code{TRUE}, or \code{"left"}).
+#' Default is \code{FALSE} (no ladderization).
+#' This parameter specifies whether or not to \code{\link[ape]{ladderize}} the tree
+#' (i.e., reorder nodes according to the depth of their enclosed
+#' subtrees) prior to plotting.
+#' This tends to make trees more aesthetically pleasing and legible in
+#' a graphical display.
+#' When \code{TRUE} or \code{"right"}, ``right'' ladderization is used.
+#' When set to \code{FALSE}, no ladderization is applied.
+#' When set to \code{"left"}, the reverse direction
+#' (``left'' ladderization) is applied.
+#'
+#' @return
+#' A list of two \code{\link{data.table}}s, containing respectively
+#' a \code{data.table} of edge segment coordinates, named \code{edgeDT},
+#' and a \code{data.table} of vertical connecting segments, named \code{vertDT}.
+#' See \code{example} below for a simple demonstration.
+#'
+#' @seealso
+#' An early example of this functionality was borrowed directly, with permission,
+#' from the package called \code{ggphylo},
+#' released on GitHub at:
+#' \url{https://github.com/gjuggler/ggphylo}
+#' by its author Gregory Jordan \email{gjuggler@@gmail.com}.
+#' That original phyloseq internal function, \code{tree.layout}, has been
+#' completely replaced by this smaller and much faster user-accessible
+#' function that utilizes performance enhancements from standard
+#' \code{\link{data.table}} magic as well as \code{\link{ape-package}}
+#' internal C code.
+#'
+#' @importFrom ape ladderize
+#' @importFrom ape reorder.phylo
+#'
+#' @importFrom data.table data.table
+#' @importFrom data.table setkey
+#'
+#' @export
+#' @examples
+#' library("ggplot2")
+#' data("esophagus")
+#' phy = phy_tree(esophagus)
+#' phy <- ape::root(phy, "65_2_5", resolve.root=TRUE)
+#' treeSegs0 = tree_layout(phy)
+#' treeSegs1 = tree_layout(esophagus)
+#' edgeMap = aes(x=xleft, xend=xright, y=y, yend=y)
+#' vertMap = aes(x=x, xend=x, y=vmin, yend=vmax)
+#' p0 = ggplot(treeSegs0$edgeDT, edgeMap) + geom_segment() + geom_segment(vertMap, data=treeSegs0$vertDT)
+#' p1 = ggplot(treeSegs1$edgeDT, edgeMap) + geom_segment() + geom_segment(vertMap, data=treeSegs1$vertDT)
+#' print(p0)
+#' print(p1)
+#' plot_tree(esophagus, "treeonly")
+#' plot_tree(esophagus, "treeonly", ladderize="left")
+tree_layout = function(phy, ladderize=FALSE){ # Returns list(edgeDT=, vertDT=): segment coordinates for drawing the tree in `phy`.
+ if(inherits(phy, "phyloseq")){
+ phy = phy_tree(phy) # a phyloseq object was given; continue with its tree component
+ }
+ if(!inherits(phy, "phylo")){
+ stop("tree missing or invalid. Please check `phy` argument and try again.")
+ }
+ if(is.null(phy$edge.length)){
+ # If no edge lengths, set them all to value of 1 (dendrogram).
+ phy$edge.length <- rep(1L, times=nrow(phy$edge))
+ }
+ # Perform ladderizing, if requested. Accepted values: FALSE, TRUE, "right", "left"; anything else is an error.
+ if(ladderize != FALSE){
+ if(ladderize == "left"){
+ phy <- ladderize(phy, FALSE) # in call position R finds the function `ladderize`, skipping the non-function argument of the same name
+ } else if(ladderize==TRUE | ladderize=="right"){
+ phy <- ladderize(phy, TRUE)
+ } else {
+ stop("You did not specify a supported option for argument `ladderize`.")
+ }
+ }
+ # 'z' is the tree in postorder order used in calls to .C
+ # Descending order of left-hand side of edge (the ancestor to the node)
+ z = reorder.phylo(phy, order="postorder")
+ # Initialize some characteristics of the tree.
+ Nedge = nrow(phy$edge)[1] # nrow() is already length one; the [1] is a no-op
+ Nnode = phy$Nnode
+ Ntip = length(phy$tip.label)
+ ROOT = Ntip + 1 # first node index after the tips
+ TIPS = phy$edge[(phy$edge[, 2] <= Ntip), 2] # tip indices, in the order they occur in the edge matrix
+ NODES = (ROOT):(Ntip + Nnode) # not referenced again in this function
+ nodelabels = phy$node.label # not referenced again in this function; phy$node.label is read directly below
+ # Call phyloseq-internal function that in-turn calls ape's internal
+ # horizontal position function, in C, using the re-ordered phylo object.
+ xx = ape_node_depth_edge_length(Ntip, Nnode, z$edge, Nedge, z$edge.length) # indexed by node number below (xx[V1], xx[V2])
+ # Initialize `yy`, before passing to ape internal function in C.
+ yy <- numeric(Ntip + Nnode)
+ yy[TIPS] <- 1:Ntip # tips receive consecutive vertical positions in edge-matrix order; internal nodes start at 0
+ # Define the ape_node_height wrapping function
+ ape_node_height <- function(Ntip, Nnode, edge, Nedge, yy){
+ .C(ape:::node_height, PACKAGE="ape",
+ as.integer(Ntip), as.integer(Nnode),
+ as.integer(edge[, 1]), as.integer(edge[, 2]),
+ as.integer(Nedge), as.double(yy))[[6]] # .C returns its arguments as a list; element 6 is the yy vector (presumably filled in by the C routine -- depends on the installed ape version's signature, TODO confirm)
+ }
+ # The call in ape
+ #yy <- .nodeHeight(Ntip, Nnode, z$edge, Nedge, yy)
+ yy <- ape_node_height(Ntip, Nnode, z$edge, Nedge, yy)
+ # Initialize an edge data.table
+ # Don't set key, order matters
+ edgeDT = data.table(phy$edge, edge.length=phy$edge.length, OTU=NA_character_) # the two edge-matrix columns are referred to as V1 (parent) and V2 (child) below
+ # Add tip.labels if present
+ if(!is.null(phy$tip.label)){
+ # Initialize OTU, set node (V2) as key, assign taxa_names as OTU label
+ edgeDT[, OTU:=NA_character_]
+ setkey(edgeDT, V2) # keying reorders rows by V2; the positional label assignments below rely on that order
+ edgeDT[V2 <= Ntip, OTU:=phy$tip.label]
+ }
+ # Add the mapping for each edge defined in `xx` and `yy`
+ edgeDT[, xleft:=xx[V1]]
+ edgeDT[, xright:=xx[V2]]
+ edgeDT[, y:=yy[V2]]
+ # Next define vertical segments: one row per parent node (V1), at the parent's x, spanning the y-range of its child edges
+ vertDT = edgeDT[, list(x=xleft[1], vmin=min(y), vmax=max(y)), by=V1, mult="last"]
+ if(!is.null(phy$node.label)){ # when the tree has no node labels, no `x`/`label` columns are added to edgeDT and no `y`/`label` columns to vertDT
+ # Add non-root node labels to edgeDT
+ edgeDT[V2 > ROOT, x:=xright]
+ edgeDT[V2 > ROOT, label:=phy$node.label[-1]]
+ # Add root label (first node label) to vertDT
+ setkey(vertDT, V1)
+ vertDT[J(ROOT), y:=mean(c(vmin, vmax))] # root label is placed at the midpoint of the root's vertical segment
+ vertDT[J(ROOT), label:=phy$node.label[1]]
+ }
+ return(list(edgeDT=edgeDT, vertDT=vertDT))
+}
+################################################################################
+# Define an internal function for determining what the text-size should be
+#' @keywords internal
+manytextsize <- function(n, mins=0.5, maxs=4, B=6, D=100){
+  # Text size decays exponentially with the number of items `n`:
+  # starts at `B` for n = 0 and shrinks with rate 1/D.
+  raw_size <- B * exp(-n/D)
+  # Anything at or below the floor `mins` is raised to `mins`.
+  floored <- ifelse(raw_size <= mins, mins, raw_size)
+  # Anything at or above the ceiling `maxs` is lowered to `maxs`.
+  clamped <- ifelse(floored >= maxs, maxs, floored)
+  return(clamped)
+}
+################################################################################
+# Return TRUE if the nodes of the tree in the phyloseq object provided are unlabeled.
+#' @keywords internal
+nodesnotlabeled = function(physeq){
+  # Guard clause: with no tree at all there can be no node labels.
+  # Warn the caller and report the nodes as unlabeled.
+  if(is.null(phy_tree(physeq, FALSE))){
+    warning("There is no phylogenetic tree in the object you have provided. Try `phy_tree(physeq)` to see.")
+    return(TRUE)
+  }
+  # A tree is present: unlabeled means the node.label element is
+  # either absent (NULL) or has zero length.
+  node_labels = phy_tree(physeq)$node.label
+  return(is.null(node_labels) || length(node_labels)==0L)
+}
+# A quick test function to decide how nodes should be labeled by default, if at all.
+#
+#' @keywords internal
+howtolabnodes = function(physeq){
+  # Unlabeled nodes: hand back `nodeplotblank`, which returns the
+  # ggplot object as-is.
+  if(nodesnotlabeled(physeq)){
+    return(nodeplotblank)
+  }
+  # Labeled nodes: build a default labeling function whose text size
+  # is derived from the number of taxa.
+  label_size = manytextsize(ntaxa(physeq))
+  return(nodeplotdefault(label_size))
+}
+################################################################################
+#' Function to avoid plotting node labels
+#'
+#' Unlike, \code{\link{nodeplotdefault}} and \code{\link{nodeplotboot}},
+#' this function does not return a function, but instead is provided
+#' directly to the \code{nodelabf} argument of \code{\link{plot_tree}} to
+#' ensure that node labels are not added to the graphic.
+#' Please note that you do not need to create or obtain the arguments to
+#' this function. Instead, you can provide this function directly to
+#' \code{\link{plot_tree}} and it will know what to do with it. Namely,
+#' use it to avoid plotting any node labels.
+#'
+#' @usage nodeplotblank(p, nodelabdf)
+#'
+#' @param p (Required). The \code{\link{plot_tree}} graphic.
+#'
+#' @param nodelabdf (Required). The \code{data.frame} produced internally in
+#' \code{\link{plot_tree}} to use as data for creating ggplot2-based tree graphics.
+#'
+#' @return The same input object, \code{p}, provided as input. Unmodified.
+#'
+#' @seealso
+#' \code{\link{nodeplotdefault}}
+#'
+#' \code{\link{nodeplotboot}}
+#'
+#' \code{\link{plot_tree}}
+#'
+#'
+#' @export
+#' @examples
+#' data("esophagus")
+#' plot_tree(esophagus)
+#' plot_tree(esophagus, nodelabf=nodeplotblank)
+nodeplotblank = function(p, nodelabdf){
+  # Identity on `p`. `nodelabdf` is accepted only for signature
+  # compatibility with the other node-label functions and is
+  # deliberately never evaluated.
+  p
+}
+################################################################################
+#' Generates a function for labeling bootstrap values on a phylogenetic tree.
+#'
+#' Is not a labeling function itself, but returns one.
+#' The returned function is specialized for labeling bootstrap values.
+#' Note that the function that
+#' is returned has two completely different arguments from the four listed here:
+#' the plot object already built by earlier steps in
+#' \code{\link{plot_tree}}, and the \code{\link{data.frame}}
+#' that contains the relevant plotting data for the nodes
+#' (especially \code{x, y, label}),
+#' respectively.
+#' See \code{\link{nodeplotdefault}} for a simpler example.
+#' The main purpose of this and \code{\link{nodeplotdefault}} is to
+#' provide a useful default function generator for arbitrary and
+#' bootstrap node labels, respectively, and also to act as
+#' examples of functions that can successfully interact with
+#' \code{\link{plot_tree}} to add node labels to the graphic.
+#'
+#' @usage nodeplotboot(highthresh=95L, lowcthresh=50L, size=2L, hjust=-0.2)
+#'
+#' @param highthresh (Optional). A single integer between 0 and 100.
+#' Any bootstrap values above this threshold will be annotated as
+#' a black filled circle on the node, rather than the bootstrap
+#' percentage value itself.
+#'
+#' @param lowcthresh (Optional). A single integer between 0 and 100,
+#' less than \code{highthresh}. Any bootstrap values below this value
+#' will not be added to the graphic. Set to 0 or below to add all
+#' available values.
+#'
+#' @param size (Optional). Numeric. Should be positive. The
+#' size parameter used to control the text size of taxa labels.
+#' Default is \code{2}. These are ggplot2 sizes.
+#'
+#' @param hjust (Optional). The horizontal justification of the
+#' node labels. Default is \code{-0.2}.
+#'
+#' @return A function that can add a bootstrap-values layer to the tree graphic.
+#' The values are represented in two ways; either as black filled circles
+#' indicating very high-confidence nodes, or the bootstrap value itself
+#' printed in small text next to the node on the tree.
+#'
+#' @seealso
+#' \code{\link{nodeplotdefault}}
+#'
+#' \code{\link{nodeplotblank}}
+#'
+#' \code{\link{plot_tree}}
+#'
+#'
+#' @export
+#' @examples
+#' nodeplotboot()
+#' nodeplotboot(size=3, hjust=-0.4)
+nodeplotboot = function(highthresh=95L, lowcthresh=50L, size=2L, hjust=-0.2){
+  # Generator: the returned closure takes the plot `p` and the node
+  # label table `nodelabdf`, and returns `p` with bootstrap layers added.
+  function(p, nodelabdf){
+    # Coerce the labels to numeric by way of character.
+    # An error raised during coercion is silenced by try().
+    try(boot <- as(as(nodelabdf$label, "character"), "numeric"), TRUE)
+    # NA/NaN entries stay in `boot`; only the observed values are checked.
+    observed = boot[complete.cases(boot)]
+    if( length(observed) > 0 && !is(observed, "numeric") ){
+      stop("The node labels, phy_tree(physeq)$node.label, are not coercable to a numeric vector with any elements.")
+    }
+    # When every observed value lies in [0, 1], treat the values as
+    # proportions: round to 2 digits and rescale to 0-100.
+    in_unit_interval = observed >= 0.0 & observed <= 1.0
+    if( all(in_unit_interval) ){
+      boot = round(boot, 2)*100L
+    }
+    nodelabdf$boot = boot
+    # Split into high-confidence nodes and mid-range nodes.
+    # Values at or below `lowcthresh` appear in neither subset.
+    strong = subset(nodelabdf, boot >= highthresh)
+    moderate = subset(nodelabdf, boot > lowcthresh & boot < highthresh)
+    # High-confidence nodes are marked with a point.
+    if( nrow(strong)>0L ){
+      p = p + geom_point(mapping=aes(x=x, y=y), data=strong, na.rm=TRUE)
+    }
+    # Mid-range nodes get their bootstrap value printed as text.
+    if( nrow(moderate)>0L ){
+      moderate$label = moderate$boot
+      p = nodeplotdefault(size, hjust)(p, moderate)
+    }
+    return(p)
+  }
+}
+################################################################################
+#' Generates a default node-label function
+#'
+#' Is not a labeling function itself, but returns one.
+#' The returned function is capable of adding
+#' whatever label is on a node. Note that the function that
+#' is returned has two completely different arguments to those listed here:
+#' the plot object already built by earlier steps in
+#' \code{\link{plot_tree}}, and the \code{\link{data.frame}}
+#' that contains the relevant plotting data for the nodes
+#' (especially \code{x, y, label}),
+#' respectively.
+#' See \code{\link{nodeplotboot}} for a more sophisticated example.
+#' The main purpose of this and \code{\link{nodeplotboot}} is to
+#' provide a useful default function generator for arbitrary and
+#' bootstrap node labels, respectively, and also to act as
+#' examples of functions that will successfully interact with
+#' \code{\link{plot_tree}} to add node labels to the graphic.
+#'
+#' @usage nodeplotdefault(size=2L, hjust=-0.2)
+#'
+#' @param size (Optional). Numeric. Should be positive. The
+#' size parameter used to control the text size of taxa labels.
+#' Default is \code{2}. These are ggplot2 sizes.
+#'
+#' @param hjust (Optional). The horizontal justification of the
+#' node labels. Default is \code{-0.2}.
+#'
+#' @return A function that can add a node-label layer to a graphic.
+#'
+#' @seealso
+#' \code{\link{nodeplotboot}}
+#'
+#' \code{\link{nodeplotblank}}
+#'
+#' \code{\link{plot_tree}}
+#'
+#' @export
+#'
+#' @examples
+#' nodeplotdefault()
+#' nodeplotdefault(3, -0.4)
+nodeplotdefault = function(size=2L, hjust=-0.2){
+  # Generator: the returned closure adds one text layer to the plot `p`,
+  # drawing the `label` column of `nodelabdf` at its (x, y) coordinates
+  # with the `size` and `hjust` captured here.
+  function(p, nodelabdf){
+    label_mapping = aes(x=x, y=y, label=label)
+    label_layer = geom_text(mapping=label_mapping, data=nodelabdf,
+                            size=size, hjust=hjust, na.rm=TRUE)
+    return(p + label_layer)
+  }
+}
+################################################################################
+#' Plot a phylogenetic tree with optional annotations
+#'
+#' There are many useful examples of phyloseq tree graphics in the
+#' \href{http://joey711.github.io/phyloseq/plot_tree-examples}{phyloseq online tutorials}.
+#' This function is intended to facilitate easy graphical investigation of
+#' the phylogenetic tree, as well as sample data. Note that for phylogenetic
+#' sequencing of samples with large richness, some of the options in this
+#' function will be prohibitively slow to render, or too dense to be
+#' interpretable. A rough ``rule of thumb'' is to use subsets of data
+#' with not many more than 200 OTUs per plot, sometimes less depending on the
+#' complexity of the additional annotations being mapped to the tree. It is
+#' usually possible to create an unreadable, uninterpretable tree with modern
+#' datasets. However, the goal should be toward parameter settings and data
+#' subsets that convey (honestly, accurately) some biologically relevant
+#' feature of the data. One of the goals of the \code{\link{phyloseq-package}}
+#' is to make the determination of these features/settings as easy as possible.
+#'
+#' This function received an early development contribution from the work of
+#' Gregory Jordan via \href{https://github.com/gjuggler/ggphylo}{the ggphylo package}.
+#' \code{plot_tree} has since been re-written.
+#' For details see \code{\link{tree_layout}}.
+#'
+#' @param physeq (Required). The data about which you want to
+#' plot and annotate a phylogenetic tree, in the form of a
+#' single instance of the \code{\link{phyloseq-class}}, containing at
+#' minimum a phylogenetic tree component (try \code{\link{phy_tree}}).
+#' One of the major advantages of this function over basic tree-plotting utilities
+#' in the \code{\link{ape}}-package is the ability to easily annotate the tree
+#' with sample variables and taxonomic information. For these uses,
+#' the \code{physeq} argument should also have a \code{\link{sample_data}}
+#' and/or \code{\link{tax_table}} component(s).
+#'
+#' @param method (Optional). Character string. Default \code{"sampledodge"}.
+#' The name of the annotation method to use.
+#' This will be expanded in future versions.
+#' Currently only \code{"sampledodge"} and \code{"treeonly"} are supported.
+#' The \code{"sampledodge"} option results in points
+#' drawn next to leaves if individuals from that taxa were observed,
+#' and a separate point is drawn for each sample.
+#'
+#' @param nodelabf (Optional). A function. Default \code{NULL}.
+#' If \code{NULL}, the default, a function will be selected for you based upon
+#' whether or not there are node labels in \code{phy_tree(physeq)}.
+#' For convenience, the phyloseq package includes two generator functions
+#' for adding arbitrary node labels (can be any character string),
+#' \code{\link{nodeplotdefault}};
+#' as well as for adding bootstrap values in a certain range,
+#' \code{\link{nodeplotboot}}.
+#' To not have any node labels in the graphic, set this argument to
+#' \code{\link{nodeplotblank}}.
+#'
+#' @param color (Optional). Character string. Default \code{NULL}.
+#' The name of the variable in \code{physeq} to map to point color.
+#' Supported options here also include the reserved special variables
+#' of \code{\link{psmelt}}.
+#'
+#' @param shape (Optional). Character string. Default \code{NULL}.
+#' The name of the variable in \code{physeq} to map to point shape.
+#' Supported options here also include the reserved special variables
+#' of \code{\link{psmelt}}.
+#'
+#' @param size (Optional). Character string. Default \code{NULL}.
+#' The name of the variable in \code{physeq} to map to point size.
+#' A special argument \code{"abundance"} is reserved here and scales
+#' point size using abundance in each sample on a log scale.
+#' Supported options here also include the reserved special variables
+#' of \code{\link{psmelt}}.
+#'
+#' @param min.abundance (Optional). Numeric.
+#' The minimum number of individuals required to label a point
+#' with the precise number.
+#' Default is \code{Inf},
+#' meaning that no points will have their abundance labeled.
+#' If a vector, only the first element is used.
+#'
+#' @param label.tips (Optional). Character string. Default is \code{NULL},
+#' indicating that no tip labels will be printed.
+#' If \code{"taxa_names"}, then the name of the taxa will be added
+#' to the tree; either next to the leaves, or next to
+#' the set of points that label the leaves. Alternatively,
+#' if this is one of the rank names (from \code{rank_names(physeq)}),
+#' then the identity (if any) for that particular taxonomic rank
+#' is printed instead.
+#'
+#' @param text.size (Optional). Numeric. Should be positive. The
+#' size parameter used to control the text size of taxa labels.
+#' Default is \code{NULL}. If left \code{NULL}, this function
+#' will automatically calculate a (hopefully) optimal text size
+#' given the vertical constraints posed by the tree itself.
+#' This argument is included in case the
+#' automatically-calculated size is wrong, and you want to change it.
+#' Note that this parameter is only meaningful if \code{label.tips}
+#' is not \code{NULL}.
+#'
+#' @param sizebase (Optional). Numeric. Should be positive.
+#' The base of the logarithm used
+#' to scale point sizes to graphically represent abundance of
+#' species in a given sample. Default is 5.
+#'
+#' @param base.spacing (Optional). Numeric. Default is \code{0.02}.
+#' Should be positive.
+#' This defines the base-spacing between points at each tip/leaf in
+#' the tree. The larger this value, the larger the spacing between points.
+#' This is useful if you have problems with overlapping large points
+#' and/or text indicating abundance, for example. Similarly, if you
+#' don't have this problem and want tighter point-spacing, you can
+#' shrink this value.
+#'
+#' @param ladderize (Optional). Boolean or character string (either
+#' \code{FALSE}, \code{TRUE}, or \code{"left"}).
+#' Default is \code{FALSE}.
+#' This parameter specifies whether or not to \code{\link[ape]{ladderize}} the tree
+#' (i.e., reorder nodes according to the depth of their enclosed
+#' subtrees) prior to plotting.
+#' This tends to make trees more aesthetically pleasing and legible in
+#' a graphical display.
+#' When \code{TRUE} or \code{"right"}, ``right'' ladderization is used.
+#' When set to \code{FALSE}, no ladderization is applied.
+#' When set to \code{"left"}, the reverse direction
+#' (``left'' ladderization) is applied.
+#' This argument is passed on to \code{\link{tree_layout}}.
+#'
+#' @param plot.margin (Optional). Numeric. Default is \code{0.2}.
+#' Should be positive.
+#' This defines how much right-hand padding to add to the tree plot,
+#' which can be required to not truncate tip labels. The margin value
+#' is specified as a fraction of the overall tree width which is added
+#' to the right side of the plot area. So a value of \code{0.2} adds
+#' twenty percent extra space to the right-hand side of the plot.
+#'
+#' @param title (Optional). Default \code{NULL}. Character string.
+#' The main title for the graphic.
+#'
+#' @param treetheme (Optional).
+#' A custom \code{\link{ggplot}}2 \code{\link[ggplot2]{theme}} layer
+#' to use for the tree. Supplants any default theme layers
+#' used within the function.
+#' A value of \code{NULL} uses a default, minimal-annotations theme.
+#' If anything other than a theme or \code{NULL}, the current global ggplot2
+#' theme will result.
+#'
+#' @param justify (Optional). A character string indicating the
+#' type of justification to use on dodged points and tip labels.
+#' A value of \code{"jagged"}, the default, results in
+#' these tip-mapped elements being spaced as close to the tips as possible
+#' without gaps.
+#' Currently, any other value for \code{justify} results in
+#' a left-justified arrangement of both labels and points.
+#'
+#' @return A \code{\link{ggplot}}2 plot.
+#'
+#' @seealso
+#' \code{\link{plot.phylo}}
+#'
+#' There are many useful examples of phyloseq tree graphics in the
+#' \href{http://joey711.github.io/phyloseq/plot_tree-examples}{phyloseq online tutorials}.
+#'
+#' @importFrom scales log_trans
+#'
+#' @importFrom data.table setkey
+#' @importFrom data.table setkeyv
+#'
+#' @importFrom ggplot2 geom_segment
+#' @importFrom ggplot2 scale_x_continuous
+#' @importFrom ggplot2 scale_size_continuous
+#' @importFrom ggplot2 element_blank
+#'
+#' @export
+#' @examples
+#' # # Using plot_tree() with the esophagus dataset.
+#' # # Please note that many more interesting examples are shown
+#' # # in the online tutorials:
+#' # # http://joey711.github.io/phyloseq/plot_tree-examples
+#' data(esophagus)
+#' # plot_tree(esophagus)
+#' # plot_tree(esophagus, color="Sample")
+#' # plot_tree(esophagus, size="Abundance")
+#' # plot_tree(esophagus, size="Abundance", color="samples")
+#' plot_tree(esophagus, size="Abundance", color="Sample", base.spacing=0.03)
+#' plot_tree(esophagus, size="abundance", color="samples", base.spacing=0.03)
+plot_tree = function(physeq, method="sampledodge", nodelabf=NULL,
+ color=NULL, shape=NULL, size=NULL,
+ min.abundance=Inf, label.tips=NULL, text.size=NULL,
+ sizebase=5, base.spacing = 0.02,
+ ladderize=FALSE, plot.margin=0.2, title=NULL,
+ treetheme=NULL, justify="jagged"){
+ ########################################
+ # Support mis-capitalization of reserved variable names in color, shape, size
+ # This helps, for instance, with backward-compatibility where "abundance"
+ # was the reserved variable name for mapping OTU abundance entries
+ fix_reserved_vars = function(aesvar){
+ aesvar <- gsub("^abundance[s]{0,}$", "Abundance", aesvar, ignore.case=TRUE)
+ aesvar <- gsub("^OTU[s]{0,}$", "OTU", aesvar, ignore.case=TRUE)
+ aesvar <- gsub("^taxa_name[s]{0,}$", "OTU", aesvar, ignore.case=TRUE)
+ aesvar <- gsub("^sample[s]{0,}$", "Sample", aesvar, ignore.case=TRUE)
+ return(aesvar)
+ }
+ if(!is.null(label.tips)){label.tips <- fix_reserved_vars(label.tips)}
+ if(!is.null(color)){color <- fix_reserved_vars(color)}
+ if(!is.null(shape)){shape <- fix_reserved_vars(shape)}
+ if(!is.null(size) ){size <- fix_reserved_vars(size)}
+ ########################################
+ if( is.null(phy_tree(physeq, FALSE)) ){
+ stop("There is no phylogenetic tree in the object you have provided.\n",
+ "Try phy_tree(physeq) to see for yourself.")
+ }
+ if(!inherits(physeq, "phyloseq")){
+ # If only a phylogenetic tree, then only tree available to overlay.
+ method <- "treeonly"
+ }
+ # Create the tree data.table
+ treeSegs <- tree_layout(phy_tree(physeq), ladderize=ladderize)
+ edgeMap = aes(x=xleft, xend=xright, y=y, yend=y)
+ vertMap = aes(x=x, xend=x, y=vmin, yend=vmax)
+ # Initialize phylogenetic tree.
+ # Naked, lines-only, unannotated tree as first layers. Edge (horiz) first, then vertical.
+ p = ggplot(data=treeSegs$edgeDT) + geom_segment(edgeMap) +
+ geom_segment(vertMap, data=treeSegs$vertDT)
+ # If no text.size given, calculate it from number of tips ("species", aka taxa)
+ # This is very fast. No need to worry about whether text is printed or not.
+ if(is.null(text.size)){
+ text.size <- manytextsize(ntaxa(physeq))
+ }
+ # Add the species labels to the right.
+ if(!is.null(label.tips) & method!="sampledodge"){
+ # If method is sampledodge, then labels are added to the right of points, later.
+ # Add labels layer to plotting object.
+ labelDT = treeSegs$edgeDT[!is.na(OTU), ]
+ if(!is.null(tax_table(object=physeq, errorIfNULL=FALSE))){
+ # If there is a taxonomy available, merge it with the label data.table
+ taxDT = data.table(tax_table(physeq), OTU=taxa_names(physeq), key="OTU")
+ # Merge with taxonomy.
+ labelDT = merge(x=labelDT, y=taxDT, by="OTU")
+ }
+ if(justify=="jagged"){
+ # Tip label aesthetic mapping.
+ # Aesthetics can be NULL, and that aesthetic gets ignored.
+ labelMap <- aes_string(x="xright", y="y", label=label.tips, color=color)
+ } else {
+ # The left-justified version of tip-labels.
+ labelMap <- aes_string(x="max(xright, na.rm=TRUE)", y="y", label=label.tips, color=color)
+ }
+ p <- p + geom_text(labelMap, data=labelDT, size=I(text.size), hjust=-0.1, na.rm=TRUE)
+ }
+ # Node label section.
+ #
+ # If no nodelabf ("node label function") given, ask internal function to pick one.
+ # Is NULL by default, meaning will dispatch to `howtolabnodes` to select function.
+ # For no node labels, the "dummy" function `nodeplotblank` will return tree plot
+ # object, p, as-is, unmodified.
+ if(is.null(nodelabf)){
+ nodelabf = howtolabnodes(physeq)
+ }
+ #### set node `y` as the mean of the vertical segment
+ # Use the provided/inferred node label function to add the node labels layer(s)
+ # Non-root nodes first
+ p = nodelabf(p, treeSegs$edgeDT[!is.na(label), ])
+ # Add root label (if present)
+ p = nodelabf(p, treeSegs$vertDT[!is.na(label), ])
+ # Theme specification
+ if(is.null(treetheme)){
+ # If NULL, then use the default tree theme.
+ treetheme <- theme(axis.ticks = element_blank(),
+ axis.title.x=element_blank(), axis.text.x=element_blank(),
+ axis.title.y=element_blank(), axis.text.y=element_blank(),
+ panel.background = element_blank(),
+ panel.grid.minor = element_blank(),
+ panel.grid.major = element_blank())
+ }
+ if(inherits(treetheme, "theme")){
+ # If a theme, add theme layer to plot.
+ # For all other cases, skip this, which will cause default theme to be used
+ p <- p + treetheme
+ }
+ # Optionally add a title to the plot
+ if(!is.null(title)){
+ p <- p + ggtitle(title)
+ }
+ if(method!="sampledodge"){
+ # If anything but a sampledodge tree, return now without further decorations.
+ return(p)
+ }
+ ########################################
+ # Sample Dodge Section
+ # Special words, c("Sample", "Abundance", "OTU")
+ # See psmelt()
+ ########################################
+ # Initialize the species/taxa/OTU data.table
+ dodgeDT = treeSegs$edgeDT[!is.na(OTU), ]
+ # Merge with psmelt() result, to make all co-variables available
+ dodgeDT = merge(x=dodgeDT, y=data.table(psmelt(physeq), key="OTU"), by="OTU")
+ if(justify=="jagged"){
+ # Remove 0 Abundance value entries now, not later, for jagged.
+ dodgeDT <- dodgeDT[Abundance > 0, ]
+ }
+ # Set key. Changes `dodgeDT` in place. OTU is first key, always.
+ if( !is.null(color) | !is.null(shape) | !is.null(size) ){
+ # If color, shape, or size is chosen, setkey by those as well
+ setkeyv(dodgeDT, cols=c("OTU", color, shape, size))
+ } else {
+ # Else, set key by OTU and sample name.
+ setkey(dodgeDT, OTU, Sample)
+ }
+ # Add sample-dodge horizontal adjustment index. In-place data.table assignment
+ dodgeDT[, h.adj.index := 1:length(xright), by=OTU]
+ # `base.spacing` is a user-input parameter.
+ # The sampledodge step size is based on this and the max `x` value
+ if(justify=="jagged"){
+ dodgeDT[, xdodge:=(xright + h.adj.index * base.spacing * max(xright, na.rm=TRUE))]
+ } else {
+ # Left-justified version, `xdodge` always starts at the max of all `xright` values.
+ dodgeDT[, xdodge := max(xright, na.rm=TRUE) + h.adj.index * base.spacing * max(xright, na.rm=TRUE)]
+ # zeroes removed only after all sample points have been mapped.
+ dodgeDT <- dodgeDT[Abundance > 0, ]
+ }
+ # The general tip-point map. Objects can be NULL, and that aesthetic gets ignored.
+ dodgeMap <- aes_string(x="xdodge", y="y", color=color, fill=color,
+ shape=shape, size=size)
+ p <- p + geom_point(dodgeMap, data=dodgeDT, na.rm=TRUE)
+ # Adjust point size transform
+ if( !is.null(size) ){
+ p <- p + scale_size_continuous(trans=log_trans(sizebase))
+ }
+ # Optionally-add abundance value label to each point.
+ # User controls this by the `min.abundance` parameter.
+ # A value of `Inf` implies no labels.
+ if( any(dodgeDT$Abundance >= min.abundance[1]) ){
+ pointlabdf = dodgeDT[Abundance>=min.abundance[1], ]
+ p <- p + geom_text(mapping=aes(xdodge, y, label=Abundance),
+ data=pointlabdf, size=text.size, na.rm=TRUE)
+ }
+ # If indicated, add the species labels to the right of dodged points.
+ if(!is.null(label.tips)){
+ # `tiplabDT` has only one row per tip, the farthest horizontal
+ # adjusted position (one for each taxa)
+ tiplabDT = dodgeDT
+ tiplabDT[, xfartiplab:=max(xdodge), by=OTU]
+ tiplabDT <- tiplabDT[h.adj.index==1, .SD, by=OTU]
+ if(!is.null(color)){
+ if(color %in% sample_variables(physeq, errorIfNULL=FALSE)){
+ color <- NULL
+ }
+ }
+ labelMap <- NULL
+ if(justify=="jagged"){
+ labelMap <- aes_string(x="xfartiplab", y="y", label=label.tips, color=color)
+ } else {
+ labelMap <- aes_string(x="max(xfartiplab, na.rm=TRUE)", y="y", label=label.tips, color=color)
+ }
+ # Add labels layer to plotting object.
+ p <- p + geom_text(labelMap, tiplabDT, size=I(text.size), hjust=-0.1, na.rm=TRUE)
+ }
+ # Plot margins.
+ # Adjust the tree graphic plot margins.
+ # Helps to manually ensure that graphic elements aren't clipped,
+ # especially when there are long tip labels.
+ min.x <- -0.01 # + dodgeDT[, min(c(xleft))]
+ max.x <- dodgeDT[, max(xright, na.rm=TRUE)]
+ if("xdodge" %in% names(dodgeDT)){
+ max.x <- dodgeDT[, max(xright, xdodge, na.rm=TRUE)]
+ }
+ if(plot.margin > 0){
+ max.x <- max.x * (1.0 + plot.margin)
+ }
+ p <- p + scale_x_continuous(limits=c(min.x, max.x))
+ return(p)
+}
+################################################################################
+# Adapted from NeatMap-package.
+# Vectorized for speed and simplicity, also only calculates theta and not r.
+#' @keywords internal
+RadialTheta <- function(pos) {
+  # Angle (radians, as returned by atan2) of each row of `pos` around the
+  # centroid of its first two columns. Only the angle is computed, not the
+  # radius. The result is named by the row names of the coerced matrix.
+  coords <- as(pos, "matrix")
+  # Offsets of every point from the column means (the centroid).
+  dx <- coords[, 1] - mean(coords[, 1])
+  dy <- coords[, 2] - mean(coords[, 2])
+  angles <- atan2(dy, dx)
+  names(angles) <- rownames(coords)
+  angles
+}
+################################################################################
+#' Create an ecologically-organized heatmap using ggplot2 graphics
+#'
+#' There are many useful examples of phyloseq heatmap graphics in the
+#' \href{http://joey711.github.io/phyloseq/plot_heatmap-examples}{phyloseq online tutorials}.
+#' In a 2010 article in BMC Bioinformatics, Rajaram and Oono describe an
+#' approach to creating a heatmap using ordination methods to organize the
+#' rows and columns instead of (hierarchical) cluster analysis. In many cases
+#' the ordination-based ordering does a much better job than h-clustering.
+#' An immediately useful example of their approach is provided in the NeatMap
+#' package for R. The NeatMap package can be used directly on the abundance
+#' table (\code{\link{otu_table-class}}) of phylogenetic-sequencing data, but
+#' the NMDS or PCA ordination options that it supports are not based on ecological
+#' distances. To fill this void, phyloseq provides the \code{plot_heatmap()}
+#' function as an ecology-oriented variant of the NeatMap approach to organizing
+#' a heatmap and build it using ggplot2 graphics tools.
+#' The \code{distance} and \code{method} arguments are the same as for the
+#' \code{\link{plot_ordination}} function, and support large number of
+#' distances and ordination methods, respectively, with a strong leaning toward
+#' ecology.
+#' This function also provides the options to re-label the OTU and sample
+#' axis-ticks with a taxonomic name and/or sample variable, respectively,
+#' in the hope that this might hasten your interpretation of the patterns
+#' (See the \code{sample.label} and \code{taxa.label} documentation, below).
+#' Note that this function makes no attempt to overlay hierarchical
+#' clustering trees on the axes, as hierarchical clustering is not used to
+#' organize the plot. Also note that each re-ordered axis repeats at the edge,
+#' and so apparent clusters at the far right/left or top/bottom of the
+#' heat-map may actually be the same. For now, the placement of this edge
+#' can be considered arbitrary, so beware of this artifact of this graphical
+#' representation. If you benefit from this phyloseq-specific implementation
+#' of the NeatMap approach, please cite both our packages/articles.
+#'
+#' This approach borrows heavily from the \code{heatmap1} function in the
+#' \code{NeatMap} package. Highly recommended, and we are grateful for their
+#' package and ideas, which we have adapted for our specific purposes here,
+#' but did not use an explicit dependency. At the time of the first version
+#' of this implementation, the NeatMap package depends on the rgl-package,
+#' which is not needed in phyloseq, at present. Although likely a transient
+#' issue, the rgl-package has some known installation issues that have further
+#' influenced our decision to avoid making NeatMap a formal dependency (although we love
+#' both NeatMap and rgl!).
+#'
+#' @param physeq (Required). The data, in the form of an instance of the
+#' \code{\link{phyloseq-class}}. This should be what you get as a result
+#' from one of the
+#' \code{\link{import}} functions, or any of the processing downstream.
+#' No data components beyond the \code{\link{otu_table}} are strictly
+#' necessary, though they may be useful if you want to re-label the
+#' axis ticks according to some observable or taxonomic rank, for instance,
+#' or if you want to use a \code{\link{UniFrac}}-based distance
+#' (in which case your \code{physeq} data would need to have a tree included).
+#'
+#' @param method (Optional).
+#' The ordination method to use for organizing the
+#' heatmap. A great deal of the usefulness of a heatmap graphic depends upon
+#' the way in which the rows and columns are ordered.
+#'
+#' @param distance (Optional). A character string.
+#' The ecological distance method to use in the ordination.
+#' See \code{\link{distance}}.
+#'
+#' @param sample.label (Optional). A character string.
+#' The sample variable by which you want to re-label the sample (horizontal) axis.
+#'
+#' @param taxa.label (Optional). A character string.
+#' The name of the taxonomic rank by which you want to
+#' re-label the taxa/species/OTU (vertical) axis.
+#' You can see available options in your data using
+#' \code{\link{rank_names}(physeq)}.
+#'
+#' @param low (Optional). A character string. An R color.
+#' See \code{?\link{colors}} for options support in R (there are lots).
+#' The color that represents the lowest non-zero value
+#' in the heatmap. Default is a dark blue color, \code{"#000033"}.
+#'
+#' @param high (Optional). A character string. An R color.
+#' See \code{\link{colors}} for options support in R (there are lots).
+#' The color that will represent the highest
+#' value in the heatmap. The default is \code{"#66CCFF"}.
+#' Zero-values are treated as \code{NA}, and set to \code{"black"}, to represent
+#' a background color.
+#'
+#' @param na.value (Optional). A character string. An R color.
+#' See \code{\link{colors}} for options support in R (there are lots).
+#' The color to represent what is essentially the background of the plot,
+#' the non-observations that occur as \code{NA} or
+#' \code{0} values in the abundance table. The default is \code{"black"}, which
+#' works well on computer-screen graphics devices, but may be a poor choice for
+#' printers, in which case you might want this value to be \code{"white"}, and
+#' reverse the values of \code{high} and \code{low}, above.
+#'
+#' @param trans (Optional). \code{"trans"}-class transformer-definition object.
+#' A numerical transformer to use in
+#' the continuous color scale. See \code{\link[scales]{trans_new}} for details.
+#' The default is \code{\link{log_trans}(4)}.
+#'
+#' @param max.label (Optional). Integer. Default is 250.
+#' The maximum number of labels to fit on a given axis (either x or y).
+#' If number of taxa or samples exceeds this value,
+#' the corresponding axis will be stripped of any labels.
+#'
+#' This supersedes any arguments provided to
+#' \code{sample.label} or \code{taxa.label}.
+#' Make sure to increase this value if, for example,
+#' you want a special label
+#' for an axis that has 300 indices.
+#'
+#' @param title (Optional). Default \code{NULL}. Character string.
+#' The main title for the graphic.
+#'
+#' @param sample.order (Optional). Default \code{NULL}.
+#' Either a single character string matching
+#' one of the \code{\link{sample_variables}} in your data,
+#' or a character vector of \code{\link{sample_names}}
+#' in the precise order that you want them displayed in the heatmap.
+#' This overrides any ordination ordering that might be done
+#' with the \code{method}/\code{distance} arguments.
+#'
+#' @param taxa.order (Optional). Default \code{NULL}.
+#' Either a single character string matching
+#' one of the \code{\link{rank_names}} in your data,
+#' or a character vector of \code{\link{taxa_names}}
+#' in the precise order that you want them displayed in the heatmap.
+#' This overrides any ordination ordering that might be done
+#' with the \code{method}/\code{distance} arguments.
+#'
+#' @param first.sample (Optional). Default \code{NULL}.
+#' A character string matching one of the \code{\link{sample_names}}
+#' of your input data (\code{physeq}).
+#' It will become the left-most sample in the plot.
+#' For the ordination-based ordering (recommended),
+#' the left and right edges of the axes are adjacent in a continuous ordering.
+#' Therefore, the choice of starting sample is meaningless and arbitrary,
+#' but it is aesthetically poor to have the left and right edge split
+#' a natural cluster in the data.
+#' This argument allows you to specify the left edge
+#' and thereby avoid cluster-splitting, emphasize a gradient, etc.
+#'
+#' @param first.taxa (Optional). Default \code{NULL}.
+#' A character string matching one of the \code{\link{taxa_names}}
+#' of your input data (\code{physeq}).
+#' This is equivalent to \code{first.sample} (above),
+#' but for the taxa/OTU indices, usually the vertical axis.
+#'
+#' @param ... (Optional). Additional parameters passed to \code{\link{ordinate}}.
+#'
+#' @return
+#' A heatmap plot, in the form of a \code{\link{ggplot}2} plot object,
+#' which can be further saved and modified.
+#'
+#' @references
+#' Because this function relies so heavily in principle, and in code, on some of the
+#' functionality in NeatMap, please cite their article if you use this function
+#' in your work.
+#'
+#' Rajaram, S., & Oono, Y. (2010).
+#' NeatMap--non-clustering heat map alternatives in R. BMC Bioinformatics, 11, 45.
+#'
+#' Please see further examples in the
+#' \href{http://joey711.github.io/phyloseq/plot_heatmap-examples}{phyloseq online tutorials}.
+#'
+#' @importFrom vegan scores
+#'
+#' @importFrom scales log_trans
+#'
+#' @importFrom ggplot2 scale_fill_gradient
+#' @importFrom ggplot2 scale_y_discrete
+#' @importFrom ggplot2 scale_x_discrete
+#' @importFrom ggplot2 scale_fill_gradient
+#' @importFrom ggplot2 geom_raster
+#'
+#' @export
+#' @examples
+#' data("GlobalPatterns")
+#' gpac <- subset_taxa(GlobalPatterns, Phylum=="Crenarchaeota")
+#' # FYI, the base-R function uses a non-ecological ordering scheme,
+#' # but does add potentially useful hclust dendrogram to the sides...
+#' gpac <- subset_taxa(GlobalPatterns, Phylum=="Crenarchaeota")
+#' # Remove the nearly-empty samples (e.g. 50 reads or fewer)
+#' gpac = prune_samples(sample_sums(gpac) > 50, gpac)
+#' # Arbitrary order if method set to NULL
+#' plot_heatmap(gpac, method=NULL, sample.label="SampleType", taxa.label="Family")
+#' # Use ordination
+#' plot_heatmap(gpac, sample.label="SampleType", taxa.label="Family")
+#' # Use ordination for OTUs, but not sample-order
+#' plot_heatmap(gpac, sample.label="SampleType", taxa.label="Family", sample.order="SampleType")
+#' # Specifying both orders omits any attempt to use ordination. The following should be the same.
+#' p0 = plot_heatmap(gpac, sample.label="SampleType", taxa.label="Family", taxa.order="Phylum", sample.order="SampleType")
+#' p1 = plot_heatmap(gpac, method=NULL, sample.label="SampleType", taxa.label="Family", taxa.order="Phylum", sample.order="SampleType")
+#' #expect_equivalent(p0, p1)
+#' # Example: Order matters. Random ordering of OTU indices is difficult to interpret, even with structured sample order
+#' rando = sample(taxa_names(gpac), size=ntaxa(gpac), replace=FALSE)
+#' plot_heatmap(gpac, method=NULL, sample.label="SampleType", taxa.label="Family", taxa.order=rando, sample.order="SampleType")
+#' # # Select the edges of each axis.
+#' # First, arbitrary edge, ordering
+#' plot_heatmap(gpac, method=NULL)
+#' # Second, biological-ordering (instead of default ordination-ordering), but arbitrary edge
+#' plot_heatmap(gpac, taxa.order="Family", sample.order="SampleType")
+#' # Third, biological ordering, selected edges
+#' plot_heatmap(gpac, taxa.order="Family", sample.order="SampleType", first.taxa="546313", first.sample="NP2")
+#' # Fourth, add meaningful labels
+#' plot_heatmap(gpac, sample.label="SampleType", taxa.label="Family", taxa.order="Family", sample.order="SampleType", first.taxa="546313", first.sample="NP2")
+plot_heatmap <- function(physeq, method="NMDS", distance="bray",
+  sample.label=NULL, taxa.label=NULL,
+  low="#000033", high="#66CCFF", na.value="black", trans=log_trans(4),
+  max.label=250, title=NULL, sample.order=NULL, taxa.order=NULL,
+  first.sample=NULL, first.taxa=NULL, ...){
+  # Outline:
+  #  1. Resolve the display order of samples and taxa (user-supplied, else
+  #     ordination-based, else default factor order).
+  #  2. Optionally rotate each order so `first.sample`/`first.taxa` lead.
+  #  3. Melt `physeq` with psmelt() and draw it with geom_raster().
+  #  4. Resize/strip/relabel axis text, add the fill scale and the title.
+  # All `if` conditions below are scalar, so they use `&&`/`||`.
+
+  # User-override ordering
+  if( !is.null(taxa.order) && length(taxa.order) == 1 ){
+    # Assume `taxa.order` is a tax_table variable. Use it for ordering.
+    rankcol <- which(rank_names(physeq) %in% taxa.order)
+    if( length(rankcol) < 1 ){
+      # Without this check the column range below fails with an opaque
+      # "argument of length 0" error.
+      stop("`taxa.order` has length 1 but does not match any of ",
+           "rank_names(physeq): ", taxa.order)
+    }
+    # Sort taxa by the concatenation of all ranks from the first rank
+    # through the requested one.
+    taxmat <- as(tax_table(physeq)[, seq_len(rankcol[1])], "matrix")
+    taxa.order <- apply(taxmat, 1, paste, sep="", collapse="")
+    names(taxa.order) <- taxa_names(physeq)
+    taxa.order <- names(sort(taxa.order, na.last=TRUE))
+  }
+  if( !is.null(sample.order) && length(sample.order) == 1 ){
+    # Assume `sample.order` is a sample variable. Use it for ordering.
+    sample.order <- as.character(get_variable(physeq, sample.order))
+    names(sample.order) <- sample_names(physeq)
+    sample.order <- names(sort(sample.order, na.last=TRUE))
+  }
+
+  if( !is.null(method) && (is.null(taxa.order) || is.null(sample.order)) ){
+    # Only attempt NeatMap if method is non-NULL & at least one of
+    # taxa.order and sample.order is not-yet defined.
+    # If both axes orders pre-defined by user, no need to perform ordination...
+
+    # Copy the approach from NeatMap by doing ordination on samples, but use
+    # phyloseq-wrapped distance/ordination procedures.
+    # Reorder by the angle in radial coordinates on the 2-axis plane.
+
+    # In case of NMDS iterations, capture the output so it isn't dumped on standard-out
+    junk <- capture.output( ps.ord <- ordinate(physeq, method, distance, ...), file=NULL)
+    if( is.null(sample.order) ){
+      siteDF <- NULL
+      # Only define new ord-based sample order if user did not define one already
+      trash1 <- try({siteDF <- scores(ps.ord, choices = c(1, 2), display="sites", physeq=physeq)},
+                    silent = TRUE)
+      if( inherits(trash1, "try-error") ){
+        # warn that the attempt to get ordination coordinates for ordering failed.
+        warning("Attempt to access ordination coordinates for sample ordering failed.\n",
+                "Using default sample ordering.")
+      }
+      if( !is.null(siteDF) ){
+        # If the score accession seemed to work, go ahead and replace sample.order
+        sample.order <- sample_names(physeq)[order(RadialTheta(siteDF))]
+      }
+    }
+
+    if( is.null(taxa.order) ){
+      # re-order species/taxa/OTUs, if possible,
+      # and only if user did not define an order already
+      specDF <- NULL
+      trash2 <- try({specDF <- scores(ps.ord, choices=c(1, 2), display="species", physeq=physeq)},
+                    silent = TRUE)
+      if( inherits(trash2, "try-error") ){
+        # warn that the attempt to get ordination coordinates for ordering failed.
+        warning("Attempt to access ordination coordinates for feature/species/taxa/OTU ordering failed.\n",
+                "Using default feature/species/taxa/OTU ordering.")
+      }
+      if( !is.null(specDF) ){
+        # If the score accession seemed to work, go ahead and replace taxa.order
+        taxa.order <- taxa_names(physeq)[order(RadialTheta(specDF))]
+      }
+    }
+  }
+
+  # Now that index orders are determined, check/assign edges of axes, if specified.
+  # If no order exists yet (e.g. `method=NULL`, or the ordination scores were
+  # unavailable), start from the default sorted factor-level order, i.e. the
+  # order that would be used below anyway. Otherwise chunkReOrder() would
+  # receive NULL and the requested edge would be ignored with a warning.
+  if( !is.null(first.sample) ){
+    if( is.null(sample.order) ){
+      sample.order <- levels(factor(sample_names(physeq)))
+    }
+    sample.order <- chunkReOrder(sample.order, first.sample)
+  }
+  if( !is.null(first.taxa) ){
+    if( is.null(taxa.order) ){
+      taxa.order <- levels(factor(taxa_names(physeq)))
+    }
+    taxa.order <- chunkReOrder(taxa.order, first.taxa)
+  }
+
+  # melt physeq with the standard user-accessible data melting function
+  # for creating plot-ready data.frames, psmelt.
+  # This is also used inside some of the other plot_* functions.
+  adf <- psmelt(physeq)
+  # Coerce the main axis variables to character.
+  # Will reset them to factor if re-ordering is needed.
+  adf$OTU <- as(adf$OTU, "character")
+  adf$Sample <- as(adf$Sample, "character")
+  if( !is.null(sample.order) ){
+    # If sample-order is available, coerce to factor with special level-order
+    adf$Sample <- factor(adf$Sample, levels=sample.order)
+  } else {
+    # Make sure it is a factor, but with default order/levels
+    adf$Sample <- factor(adf$Sample)
+  }
+  if( !is.null(taxa.order) ){
+    # If OTU-order is available, coerce to factor with special level-order
+    adf$OTU <- factor(adf$OTU, levels=taxa.order)
+  } else {
+    # Make sure it is a factor, but with default order/levels
+    adf$OTU <- factor(adf$OTU)
+  }
+
+  ## Now the plotting part
+  # Initialize p.
+  p <- ggplot(adf, aes(x = Sample, y = OTU, fill=Abundance)) +
+    geom_raster()
+
+  # # Don't render labels if more than max.label
+  # Samples
+  if( nsamples(physeq) <= max.label ){
+    # Add resize layer for samples if there are fewer than max.label
+    p <- p + theme(
+      axis.text.x = element_text(
+        size=manytextsize(nsamples(physeq), 4, 30, 12),
+        angle=-90, vjust=0.5, hjust=0
+      )
+    )
+  } else {
+    # Remove the labels from any rendering.
+    p <- p + scale_x_discrete("Sample", labels="")
+  }
+  # OTUs
+  if( ntaxa(physeq) <= max.label ){
+    # Add resize layer for OTUs if there are fewer than max.label
+    p <- p + theme(
+      axis.text.y = element_text(
+        size=manytextsize(ntaxa(physeq), 4, 30, 12)
+      )
+    )
+  } else {
+    # Remove the labels from any rendering.
+    p <- p + scale_y_discrete("OTU", labels="")
+  }
+
+  # # Axis Relabeling (Skipped if more than max.label):
+  # Re-write sample-labels to some sample variable...
+  if( !is.null(sample.label) && nsamples(physeq) <= max.label ){
+    # Make a sample-named char-vector of the values for sample.label
+    labvec <- as(get_variable(physeq, sample.label), "character")
+    names(labvec) <- sample_names(physeq)
+    if( !is.null(sample.order) ){
+      # Re-order according to sample.order
+      labvec <- labvec[sample.order]
+    }
+    # Replace any NA (missing) values with "" instead. Avoid recycling labels.
+    labvec[is.na(labvec)] <- ""
+    # Add the sample.label re-labeling layer
+    p <- p + scale_x_discrete(sample.label, labels=labvec)
+  }
+  if( !is.null(taxa.label) && ntaxa(physeq) <= max.label ){
+    # Make a OTU-named vector of the values for taxa.label
+    labvec <- as(tax_table(physeq)[, taxa.label], "character")
+    names(labvec) <- taxa_names(physeq)
+    if( !is.null(taxa.order) ){
+      # Re-order according to taxa.order
+      labvec <- labvec[taxa.order]
+    }
+    # Replace any NA (missing) values with "" instead. Avoid recycling labels.
+    labvec[is.na(labvec)] <- ""
+    # Add the taxa.label re-labeling layer
+    p <- p + scale_y_discrete(taxa.label, labels=labvec)
+  }
+
+  # Color scale transformations
+  if( !is.null(trans) ){
+    p <- p + scale_fill_gradient(low=low, high=high, trans=trans, na.value=na.value)
+  } else {
+    p <- p + scale_fill_gradient(low=low, high=high, na.value=na.value)
+  }
+
+  # Optionally add a title to the plot
+  if( !is.null(title) ){
+    p <- p + ggtitle(title)
+  }
+
+  return(p)
+}
+################################################################################
+#' Chunk re-order a vector so that specified newstart is first.
+#'
+#' Different than relevel.
+#'
+#' @keywords internal
+#' @examples
+#' # Typical use-case
+#' # chunkReOrder(1:10, 5)
+#' # # Default is to not modify the vector
+#' # chunkReOrder(1:10)
+#' # # Another example not starting at 1
+#' # chunkReOrder(10:25, 22)
+#' # # Should silently ignore the second element of `newstart`
+#' # chunkReOrder(10:25, c(22, 11))
+#' # # Should be able to handle `newstart` being the first argument already
+#' # # without duplicating the first element at the end of `x`
+#' # chunkReOrder(10:25, 10)
+#' # all(chunkReOrder(10:25, 10) == 10:25)
+#' # # This is also the default
+#' # all(chunkReOrder(10:25) == 10:25)
+#' # # An example with characters
+#' # chunkReOrder(LETTERS, "G")
+#' # chunkReOrder(LETTERS, "B")
+#' # chunkReOrder(LETTERS, "Z")
+#' # # What about when `newstart` is not in `x`? Return x as-is, throw warning.
+#' # chunkReOrder(LETTERS, "g")
+chunkReOrder <- function(x, newstart = x[[1]]) {
+  # Rotate `x` so that the first match of `newstart[1]` becomes the first
+  # element; the elements that preceded it are moved, in order, to the end.
+  # Only the first element of `newstart` is consulted.
+  start_idx <- match(newstart[1], x, nomatch = NA)
+  if (!is.na(start_idx)) {
+    # Number of elements that sit in front of the new start.
+    n_before <- start_idx - 1L
+    return(c(tail(x, length(x) - n_before), head(x, n_before)))
+  }
+  # No match: warn and hand `x` back untouched.
+  warning("The `newstart` argument was not in `x`. Returning `x` without reordering.")
+  x
+}
+################################################################################
+#' Create a ggplot summary of gap statistic results
+#'
+#' @param clusgap (Required).
+#' An object of S3 class \code{"clusGap"}, basically a list with components.
+#' See the \code{\link[cluster]{clusGap}} documentation for more details.
+#' In most cases this will be the output of \code{\link{gapstat_ord}},
+#' or \code{\link[cluster]{clusGap}} if you called it directly.
+#'
+#' @param title (Optional). Character string.
+#' The main title for the graphic.
+#' Default is \code{"Gap Statistic results"}.
+#'
+#' @return
+#' A \code{\link[ggplot2]{ggplot}} plot object.
+#' The rendered graphic should be a plot of the gap statistic score
+#' versus values for \code{k}, the number of clusters.
+#'
+#' @seealso
+#' \code{\link{gapstat_ord}}
+#'
+#' \code{\link[cluster]{clusGap}}
+#'
+#' \code{\link[ggplot2]{ggplot}}
+#'
+#' @importFrom ggplot2 geom_errorbar
+#' @importFrom ggplot2 geom_line
+#'
+#' @export
+#'
+#' @examples
+#' # Load and process data
+#' data("soilrep")
+#' soilr = rarefy_even_depth(soilrep, rngseed=888)
+#' print(soilr)
+#' sample_variables(soilr)
+#' # Ordination
+#' sord = ordinate(soilr, "DCA")
+#' # Gap Statistic
+#' gs = gapstat_ord(sord, axes=1:4, verbose=FALSE)
+#' # Evaluate results with plots, etc.
+#' plot_scree(sord)
+#' plot_ordination(soilr, sord, color="Treatment")
+#' plot_clusgap(gs)
+#' print(gs, method="Tibs2001SEmax")
+#' # Non-ordination example, use cluster::clusGap function directly
+#' library("cluster")
+#' pam1 = function(x, k){list(cluster = pam(x, k, cluster.only=TRUE))}
+#' gs.pam.RU = clusGap(ruspini, FUN = pam1, K.max = 8, B = 60)
+#' gs.pam.RU
+#' plot(gs.pam.RU, main = "Gap statistic for the 'ruspini' data")
+#' mtext("k = 4 is best .. and k = 5 pretty close")
+#' plot_clusgap(gs.pam.RU)
+plot_clusgap <- function(clusgap, title="Gap Statistic results"){
+  # Plot the gap statistic (`gap`) against the number of clusters `k`, with
+  # error bars of +/- `SE.sim`. Both columns are read from `clusgap$Tab`.
+  # `k` is the row index of `clusgap$Tab`; seq_len() keeps it the same
+  # length as the table even when the table has zero rows (1:0 would not).
+  gstab <- data.frame(clusgap$Tab, k = seq_len(nrow(clusgap$Tab)))
+  p <- ggplot(gstab, aes(k, gap)) +
+    geom_line() +
+    geom_point(size = 5) +
+    geom_errorbar(aes(ymax = gap + SE.sim, ymin = gap - SE.sim)) +
+    ggtitle(title)
+  return(p)
+}
+################################################################################
\ No newline at end of file
diff --git a/R/sampleData-class.R b/R/sampleData-class.R
new file mode 100644
index 0000000..22b5396
--- /dev/null
+++ b/R/sampleData-class.R
@@ -0,0 +1,149 @@
+################################################################################
+#' Build or access sample_data.
+#'
+#' This is the suggested method for both constructing and accessing a table
+#' of sample-level variables (\code{\link{sample_data-class}}),
+#' which in the \code{\link{phyloseq-package}} is represented as a special
+#' extension of the \code{\link{data.frame-class}}.
+#' When the
+#' argument is a \code{\link{data.frame}}, \code{sample_data} will create
+#' a sample_data-class object.
+#' In this case, the rows should be named to match the
+#' \code{\link{sample_names}} of the other objects to which it will ultimately be paired.
+#' Alternatively, if the first argument is an experiment-level (\code{\link{phyloseq-class}})
+#' object, then the corresponding \code{sample_data} is returned.
+#' Like other accessors (see See Also, below), the default behavior of this method
+#' is to stop with an
+#' error if \code{object} is a \code{phyloseq-class} but does not
+#' contain a \code{sample_data}.
+#'
+#' @usage sample_data(object, errorIfNULL=TRUE)
+#'
+#' @param object (Required). A \code{\link{data.frame-class}},
+#' or a \code{\link{phyloseq-class}} object.
+#'
+#' @param errorIfNULL (Optional). Logical. Should the accessor stop with
+#' an error if the slot is empty (\code{NULL})? Default \code{TRUE}.
+#'
+#' @return A \code{\link{sample_data-class}} object
+#' representing the sample variates of an experiment.
+#'
+#' @seealso \code{\link{phy_tree}}, \code{\link{tax_table}}, \code{\link{otu_table}}
+#' \code{\link{phyloseq}}, \code{\link{merge_phyloseq}}
+#'
+#' @aliases sample_data
+#'
+#' @rdname sample_data-methods
+#' @docType methods
+#' @export
+#'
+#' @examples #
+#' data(soilrep)
+#' head(sample_data(soilrep))
+setGeneric("sample_data", function(object, errorIfNULL=TRUE) standardGeneric("sample_data"))
+#' @rdname sample_data-methods
+#' @aliases sample_data,ANY-method
+setMethod("sample_data", "ANY", function(object, errorIfNULL=TRUE){
+ # Accessor path, used for any `object` not handled by a more specific
+ # method: ask access() for the "sam_data" slot and forward `errorIfNULL`
+ # unchanged, so access() decides whether an empty slot is an error.
+ access(object, "sam_data", errorIfNULL)
+})
+# constructor; for creating sample_data from a data.frame
+#' @rdname sample_data-methods
+#' @aliases sample_data,data.frame-method
+setMethod("sample_data", "data.frame", function(object){
+  # Constructor path: build a sample_data object from a data.frame.
+  # Make sure there are no phantom levels in categorical variables
+  object <- reconcile_categories(object)
+
+  # instantiate first to check validity
+  SM <- new("sample_data", object)
+
+  # Want dummy samples index names if missing.
+  # Row names equal to "1", "2", ..., n are treated as "no names given" and
+  # replaced by "sa1", "sa2", ..., "san".
+  # seq_len() avoids the c(1, 0) that `1:nrow(SM)` yields for a zero-row
+  # table; the explicit n > 0 guard is needed because paste0("sa", integer(0))
+  # is "sa", which cannot be assigned as row names of an empty table.
+  n_samples <- nrow(SM)
+  if( n_samples > 0 && all(rownames(SM) == as.character(seq_len(n_samples))) ){
+    rownames(SM) <- paste0("sa", seq_len(n_samples))
+  }
+  return(SM)
+})
+################################################################################
+#' Cleans absent levels in sample_data/data.frame.
+#'
+#' This is used internally by the builder method, \code{\link{sample_data}}, to
+#' ensure that the factors describing categorical variables in a data.frame or
+#' sample_data object are free of extra levels that can plague downstream plots
+#' and analysis.
+#'
+#' @usage reconcile_categories(DFSM)
+#'
+#' @param DFSM (Required). A \code{data.frame} or \code{sample_data} object that needs to be cleaned.
+#'
+#' @return A single \code{data.frame} object. Even if the input argument is a \code{sample_data},
+#' the return is a \code{data.frame}. Because this is intended to be used internally by
+#' the builder method, it cannot also call the builder function to re-build
+#' the cleaned \code{sample_data}.
+#'
+#' @keywords internal
+#'
+#' @examples
+#' # # # data(GlobalPatterns)
+#' # # # SM <- sample_data(GlobalPatterns)
+#' # # # DF <- data.frame(SM)
+#' # # # DF <- data.frame(DF, col1=1:nrow(DF), col2=paste(1:nrow(DF), "t", sep=""))
+#' # # # DF <- reconcile_categories(DF)
+#' # # # SM <- sample_data(reconcile_categories(SM))
+#' # # # sapply(DF, class)
+#' # # # sapply(SM, class)
+reconcile_categories <- function(DFSM) {
+  # Coerce the input to a data.frame, then let droplevels() discard factor
+  # levels that do not occur in the data. The result is always a data.frame.
+  droplevels(as(DFSM, "data.frame"))
+}
+################################################################################
+#' Subset samples by sample_data expression
+#'
+#' This is a convenience wrapper around the \code{\link{subset}} function.
+#' It is intended to allow subsetting complex experimental objects with one
+#' function call.
+#' Subsetting is based on an expression for which the context first includes
+#' the variables contained in \code{\link{sample_data}}.
+#' The \code{samples} retained in the dataset is equivalent to
+#' \code{x[subset & !is.na(subset)]}, where \code{x} is the vector of sample IDs
+#' and \code{subset} is the logical that results from your subsetting expression.
+#' This is important to keep in mind, as users are often unaware that this
+#' subsetting step also removes/omits samples that have a missing value, \code{NA},
+#' somewhere in the expression.
+#'
+#' @usage subset_samples(physeq, ...)
+#'
+#' @param physeq A \code{\link{sample_data-class}}, or a \code{\link{phyloseq-class}}
+#' object with a
+#' \code{sample_data}. If the \code{sample_data} slot is missing in \code{physeq},
+#' then \code{physeq} will be returned as-is, and a warning will be printed to screen.
+#'
+#' @param ... The subsetting expression that should be applied to the
+#' \code{sample_data}. This is passed on to \code{\link{subset}}, see its
+#' documentation for more details.
+#'
+#' @return A subsetted object with the same class as \code{physeq}.
+#'
+#' @seealso \code{\link{subset_species}}
+#'
+#' @export
+#' @rdname subset_samples-methods
+#' @docType methods
+#'
+#' @examples
+#' # data(GlobalPatterns)
+#' # subset_samples(GlobalPatterns, SampleType=="Ocean")
+subset_samples <- function(physeq, ...){
+  # Subset the samples of `physeq` with a subset()-style expression that is
+  # evaluated against its sample_data. Returns an object of the same class
+  # as `physeq`.
+  #
+  # Probe for sample_data with errorIfNULL = FALSE. With the accessor's
+  # default (TRUE) a missing slot raises an error, so the documented
+  # "return physeq as-is" branch could never be reached.
+  if( is.null(sample_data(physeq, FALSE)) ){
+    cat("Nothing subset. No sample_data in physeq.\n")
+    return(physeq)
+  }
+  # Work on a plain data.frame so that base subset() applies.
+  oldDF <- as(sample_data(physeq), "data.frame")
+  newDF <- subset(oldDF, ...)
+  if( inherits(physeq, "sample_data") ){
+    # Input was itself a sample_data table: return the rebuilt table.
+    return(sample_data(newDF))
+  }
+  # Otherwise assign the subsetted table back into `physeq` via the
+  # `sample_data<-` replacement and return the updated object.
+  sample_data(physeq) <- sample_data(newDF)
+  return(physeq)
+}
+################################################################################
diff --git a/R/show-methods.R b/R/show-methods.R
new file mode 100644
index 0000000..d58bd9d
--- /dev/null
+++ b/R/show-methods.R
@@ -0,0 +1,82 @@
+############################################################################
+#' @rdname show-methods
+setMethod("show", "otu_table", function(object){
+  # Header line: dimensions of the table expressed as taxa and samples.
+  header <- paste0("OTU Table: [", ntaxa(object), " taxa and ",
+                   nsamples(object), " samples]")
+  cat(header, fill = TRUE)
+  # Orientation line, followed by the underlying matrix itself.
+  orientation <- if( taxa_are_rows(object) ) " taxa are rows" else " taxa are columns"
+  cat(orientation, fill = TRUE)
+  show(as(object, "matrix"))
+})
+############################################################################
+#' @rdname show-methods
+setMethod("show", "sample_data", function(object){
+  # Rows are reported as samples, columns as sample variables.
+  sdims <- dim(sample_data(object))
+  header <- paste0("Sample Data: [", sdims[1], " samples by ",
+                   sdims[2], " sample variables]:")
+  cat(header, fill = TRUE)
+  # Then print the table as a plain data.frame.
+  show(as(object, "data.frame"))
+})
+############################################################################
+#' @rdname show-methods
+setMethod("show", "taxonomyTable", function(object){
+  # Rows are reported as taxa, columns as taxonomic ranks.
+  tdims <- dim(object)
+  header <- paste0("Taxonomy Table: [", tdims[1], " taxa by ",
+                   tdims[2], " taxonomic ranks]:")
+  cat(header, fill = TRUE)
+  # Then print the table as a plain matrix.
+  show(as(object, "matrix"))
+})
+############################################################################
+#' method extensions to show for phyloseq objects.
+#'
+#' See the general documentation of \code{\link[methods]{show}} method for
+#' expected behavior.
+#'
+#' @seealso \code{\link[methods]{show}}
+#'
+#' @inheritParams methods::show
+#' @export
+#' @rdname show-methods
+#' @examples
+#' # data(GlobalPatterns)
+#' # show(GlobalPatterns)
+#' # GlobalPatterns
+setMethod("show", "phyloseq", function(object){
+  cat("phyloseq-class experiment-level object", fill=TRUE)
+
+  # The otu_table component is always summarized.
+  otu <- otu_table(object)
+  cat(paste0("otu_table() OTU Table: [ ", ntaxa(otu), " taxa and ",
+             nsamples(otu), " samples ]"), fill = TRUE)
+
+  # Each remaining component is summarized only when its accessor, called
+  # with errorIfNULL = FALSE, returns something other than NULL.
+
+  # Sample data: rows by columns.
+  if( !is.null(sample_data(object, FALSE)) ){
+    sdims <- dim(sample_data(object))
+    cat(paste0("sample_data() Sample Data: [ ", sdims[1], " samples by ",
+               sdims[2], " sample variables ]"), fill = TRUE)
+  }
+
+  # Taxonomy table: rows by columns.
+  if( !is.null(tax_table(object, FALSE)) ){
+    tdims <- dim(tax_table(object))
+    cat(paste0("tax_table() Taxonomy Table: [ ", tdims[1], " taxa by ",
+               tdims[2], " taxonomic ranks ]"), fill = TRUE)
+  }
+
+  # Tree: number of tips and the tree's Nnode element.
+  if( !is.null(phy_tree(object, FALSE)) ){
+    tree <- phy_tree(object)
+    cat(paste0("phy_tree() Phylogenetic Tree: [ ", ntaxa(tree), " tips and ",
+               tree$Nnode, " internal nodes ]"), fill = TRUE)
+  }
+
+  # Reference sequences: first class name and number of sequences.
+  if( !is.null(refseq(object, FALSE)) ){
+    seqs <- refseq(object)
+    cat(paste0("refseq() ", class(seqs)[1], ": [ ", ntaxa(seqs), " reference sequences ]"), fill=TRUE)
+  }
+
+})
+############################################################################
diff --git a/R/taxonomyTable-class.R b/R/taxonomyTable-class.R
new file mode 100644
index 0000000..ce77d01
--- /dev/null
+++ b/R/taxonomyTable-class.R
@@ -0,0 +1,132 @@
+################################################################################
+#' Build or access the taxonomyTable.
+#'
+#' This is the suggested method for both constructing and accessing a table of
+#' taxonomic names, organized with ranks as columns (\code{\link{taxonomyTable-class}}).
+#' When the argument is a character matrix, tax_table() will create and return a
+#' \code{\link{taxonomyTable-class}} object.
+#' In this case, the rows should be named to match the
+#' \code{species.names} of the other objects to which it will ultimately be paired.
+#' Alternatively, if the first argument is an experiment-level (\code{\link{phyloseq-class}})
+#' object, then the corresponding \code{taxonomyTable} is returned.
+#' Like other accessors (see See Also, below), the default behavior of this method
+#' is to stop with an
+#' error if \code{object} is a \code{phyloseq-class} but does not
+#' contain a \code{taxonomyTable}.
+#'
+#' @usage tax_table(object, errorIfNULL=TRUE)
+#'
+#' @param object An object among the set of classes defined by the phyloseq
+#' package that contain taxonomyTable.
+#'
+#' @param errorIfNULL (Optional). Logical. Should the accessor stop with
+#' an error if the slot is empty (\code{NULL})? Default \code{TRUE}.
+#'
+#' @return A \code{\link{taxonomyTable-class}} object.
+#' It is either grabbed from the relevant slot
+#' if \code{object} is complex, or built anew if \code{object} is a
+#' character matrix representing the taxonomic classification of
+#' species in the experiment.
+#'
+#' @seealso \code{\link{phy_tree}}, \code{\link{sample_data}}, \code{\link{otu_table}}
+#' \code{\link{phyloseq}}, \code{\link{merge_phyloseq}}
+#'
+#' @rdname tax_table-methods
+#' @docType methods
+#' @export
+#'
+#' @examples #
+#' # tax1 <- tax_table(matrix("abc", 30, 8))
+#' # data(GlobalPatterns)
+#' # tax_table(GlobalPatterns)
+setGeneric(
+  "tax_table",
+  function(object, errorIfNULL=TRUE) standardGeneric("tax_table")
+)
+#' @rdname tax_table-methods
+#' @aliases tax_table,ANY-method
+setMethod("tax_table", "ANY", function(object, errorIfNULL=TRUE){
+  # Accessor path: hand the lookup of the "tax_table" component to access(),
+  # forwarding the caller's errorIfNULL choice.
+  component <- "tax_table"
+  access(object, component, errorIfNULL)
+})
+# Constructor; for creating taxonomyTable from a matrix.
+#' @rdname tax_table-methods
+#' @aliases tax_table,matrix-method
+setMethod("tax_table", "matrix", function(object){
+  # Constructor path: build a taxonomyTable from a matrix, filling in dummy
+  # row ("sp1", "sp2", ...) and column ("ta1", "ta2", ...) names when the
+  # matrix has none.
+  #
+  # seq_len() is used instead of `1:n` so that a zero-extent dimension yields
+  # an empty set of names rather than the mismatched c(1, 0) sequence.
+  if(is.null(rownames(object))){
+    rownames(object) <- sprintf("sp%d", seq_len(nrow(object)))
+  }
+  if(is.null(colnames(object))){
+    colnames(object) <- sprintf("ta%d", seq_len(ncol(object)))
+  }
+  # instantiate as taxonomyTable
+  return(new("taxonomyTable", object))
+})
+# Constructor; coerce to matrix, then pass on for creating taxonomyTable.
+#' @rdname tax_table-methods
+#' @aliases tax_table,data.frame-method
+setMethod("tax_table", "data.frame", function(object){
+  # Warn up front that the data.frame is about to be coerced.
+  msg <- paste0(
+    "Coercing from data.frame class to character matrix \n",
+    "prior to building taxonomyTable. \n",
+    "This could introduce artifacts. \n",
+    "Check your taxonomyTable, or coerce to matrix manually."
+  )
+  warning(msg)
+  # data.frame -> matrix -> character vector -> matrix of the same shape.
+  # matrix() is called without dimnames, so the rebuilt matrix carries none.
+  char_values <- as(as(object, "matrix"), "character")
+  TT <- matrix(char_values, nrow=nrow(object), ncol=ncol(object))
+  # The matrix method finishes the construction.
+  tax_table(TT)
+})
+################################################################################
+#' Subset species by taxonomic expression
+#'
+#' This is a convenience wrapper around the \code{\link{subset}} function.
+#' It is intended to speed subsetting complex experimental objects with one
+#' function call. In the case of \code{subset_taxa}, the subsetting will be
+#' based on an expression related to the columns and values within the
+#' \code{tax_table} (\code{taxonomyTable} component) slot of \code{physeq}.
+#' The \code{OTUs} retained in the dataset is equivalent to
+#' \code{x[subset & !is.na(subset)]}, where \code{x} is the vector of OTU IDs
+#' and \code{subset} is the logical that results from your subsetting expression.
+#' This is important to keep in mind, as users are often unaware that this
+#' subsetting step also removes/omits OTUs that have a missing value result, \code{NA},
+#' somewhere in the expression.
+#'
+#' @usage subset_taxa(physeq, ...)
+#'
+#' @param physeq A \code{\link{taxonomyTable-class}}, or \code{\link{phyloseq-class}} that contains a
+#' taxonomyTable. If the \code{tax_table} slot is missing in \code{physeq}, then \code{physeq}
+#' will be returned as-is and a message will be printed to screen.
+#'
+#' @param ... The subsetting expression that should be applied to the
+#' \code{taxonomyTable}. This is passed on to \code{\link{subset}}, and more
+#' details and examples about how it functions can be found in its documentation.
+#'
+#' @return A subsetted object with the same class as \code{physeq}.
+#'
+#' @seealso \code{\link{subset_samples}}
+#'
+#' @rdname subset_taxa-methods
+#' @docType methods
+#' @export
+#'
+#' @examples
+#' ## ex3 <- subset_taxa(GlobalPatterns, Phylum=="Bacteroidetes")
+subset_taxa <- function(physeq, ...){
+  # Subset the taxa of `physeq` with an expression that is evaluated against
+  # the columns of its taxonomy table; `...` is forwarded unchanged to subset().
+  #
+  # The accessor is queried with errorIfNULL = FALSE so that an absent
+  # tax_table can actually reach the "nothing subset" branch rather than
+  # being handled by the accessor's error-if-NULL default.
+  if( is.null(tax_table(physeq, FALSE)) ){
+    cat("Nothing subset. No taxonomyTable in physeq.\n")
+    return(physeq)
+  }
+  # subset() needs a data.frame, so round-trip matrix -> data.frame -> matrix.
+  oldMA <- as(tax_table(physeq), "matrix")
+  oldDF <- data.frame(oldMA)
+  newDF <- subset(oldDF, ...)
+  newMA <- as(newDF, "matrix")
+  if( inherits(physeq, "taxonomyTable") ){
+    return(tax_table(newMA))
+  }
+  tax_table(physeq) <- tax_table(newMA)
+  return(physeq)
+}
+################################################################################
diff --git a/R/transform_filter-methods.R b/R/transform_filter-methods.R
new file mode 100644
index 0000000..c44e4c0
--- /dev/null
+++ b/R/transform_filter-methods.R
@@ -0,0 +1,1187 @@
+################################################################################
+# Function to create subsampled dataset
+# in which each sample has same number of total observations/counts/reads
+# Note that the subsampling is random, so some noise is introduced making the
+# relative abundances slightly different
+################################################################################
+#' Resample an OTU table such that all samples have the same library size.
+#'
+#' Please note that the authors of phyloseq do not advocate using this
+#' as a normalization procedure, despite its recent popularity.
+#' Our justifications for using alternative approaches to address
+#' disparities in library sizes have been made available as
+#' \href{http://dx.plos.org/10.1371/journal.pcbi.1003531}{an article in PLoS Computational Biology}.
+#' See \code{\link{phyloseq_to_deseq2}} for a recommended alternative to rarefying
+#' directly supported in the phyloseq package, as well as
+#' \href{http://joey711.github.io/waste-not-supplemental/}{the supplemental materials for the PLoS-CB article}
+#' and \href{http://joey711.github.io/phyloseq-extensions}{the phyloseq extensions repository on GitHub}.
+#' Nevertheless, for comparison and demonstration, the rarefying procedure is implemented
+#' here in good faith and with options we hope are useful.
+#' This function uses the standard R \code{\link{sample}} function to
+#' resample from the abundance values
+#' in the \code{\link{otu_table}} component of the first argument,
+#' \code{physeq}.
+#' Often one of the major goals of this procedure is to achieve parity in
+#' total number of counts between samples, as an alternative to other formal
+#' normalization procedures, which is why a single value for the
+#' \code{sample.size} is expected.
+#' This kind of resampling can be performed with and without replacement,
+#' with replacement being the more computationally-efficient, default setting.
+#' See the \code{replace} parameter documentation for more details.
+#' We recommend that you explicitly select a random number generator seed
+#' before invoking this function, or, alternatively, that you
+#' explicitly provide a single positive integer argument as \code{rngseed}.
+#'
+#' This approach is sometimes mistakenly called ``rarefaction'', which
+#' \href{http://en.wikipedia.org/wiki/Rarefaction}{in physics refers to a form of wave decompression;}
+#' but in this context, ecology, the term refers to a
+#' \href{http://en.wikipedia.org/wiki/Rarefaction_(ecology)}{repeated sampling procedure to assess species richness},
+#' first proposed in 1968 by Howard Sanders.
+#' In contrast, the procedure implemented here is used as an \emph{ad hoc} means to
+#' normalize microbiome counts that have
+#' resulted from libraries of widely-differing sizes.
+#' Here we have intentionally adopted an alternative
+#' name, \code{rarefy}, that has also been used recently
+#' to describe this process
+#' and, to our knowledge, not previously used in ecology.
+#'
+#' Make sure to use \code{\link{set.seed}} for exactly-reproducible results
+#' of the random subsampling.
+#'
+#' @param physeq (Required). A \code{\link{phyloseq-class}} object that you
+#' want to trim/filter.
+#'
+#' @param sample.size (Optional). A single integer value equal to the number
+#' of reads being simulated, also known as the depth,
+#' and also equal to each value returned by \code{\link{sample_sums}}
+#' on the output.
+#'
+#' @param rngseed (Optional). A single integer value passed to
+#' \code{\link{set.seed}}, which is used to fix a seed for reproducibly
+#' random number generation (in this case, reproducibly random subsampling).
+#' The default value is \code{FALSE}.
+#' If set to \code{FALSE}, then no fiddling with the RNG seed is performed,
+#' and it is up to the user to appropriately call \code{\link{set.seed}}
+#' beforehand to achieve reproducible results.
+#'
+#' @param replace (Optional). Logical. Whether to sample with replacement
+#' (\code{TRUE}) or without replacement (\code{FALSE}).
+#' The default is with replacement (\code{replace=TRUE}).
+#' Two implications to consider are that
+#' (1) sampling with replacement is faster and more memory efficient
+#' as currently implemented; and
+#' (2), sampling with replacement means that there is a chance that the
+#' number of reads for a given OTU in a given sample could be larger
+#' than the original count value, as opposed to sampling without replacement
+#' where the original count value is the maximum possible.
+#' Prior to phyloseq package version number \code{1.5.20},
+#' this parameter did not exist and sampling with replacement was the only
+#' random subsampling implemented in the \code{rarefy_even_depth} function.
+#' Note that this default behavior was selected for computational efficiency,
+#' but differs from analogous functions in related packages
+#' (e.g. subsampling in QIIME).
+#'
+#' @param trimOTUs (Optional). \code{\link{logical}(1)}.
+#' Whether to trim OTUs
+#' from the dataset that are no longer observed in any sample
+#' (have a count of zero in every sample).
+#' The number of OTUs trimmed, if any, is printed to
+#' standard out as a reminder.
+#'
+#' @param verbose (Optional). Logical. Default is \code{TRUE}.
+#' If \code{TRUE}, extra non-warning, non-error messages are printed
+#' to standard out, describing steps in the rarefying process,
+#' the OTUs and samples removed, etc. This can be useful the
+#' first few times the function is executed, but can be set
+#' to \code{FALSE} as-needed once behavior has been verified
+#' as expected.
+#'
+#' @return An object of class \code{phyloseq}.
+#' Only the \code{otu_table} component is modified.
+#'
+#' @seealso
+#' \code{\link{sample}}
+#'
+#' \code{\link{set.seed}}
+#'
+#' @export
+#'
+#' @examples
+#' # Test with esophagus dataset
+#' data("esophagus")
+#' esorepT = rarefy_even_depth(esophagus, replace=TRUE)
+#' esorepF = rarefy_even_depth(esophagus, replace=FALSE)
+#' sample_sums(esophagus)
+#' sample_sums(esorepT)
+#' sample_sums(esorepF)
+#' ## Run manually: too slow!
+#' # data("GlobalPatterns")
+#' # GPrepT = rarefy_even_depth(GlobalPatterns, 1E5, replace=TRUE)
+#' ## Actually just this one is slow
+#' # system.time(GPrepF <- rarefy_even_depth(GlobalPatterns, 1E5, replace=FALSE))
+rarefy_even_depth <- function(physeq, sample.size=min(sample_sums(physeq)),
+                              rngseed=FALSE, replace=TRUE, trimOTUs=TRUE, verbose=TRUE){
+  # Randomly subsample every sample in `physeq` down to `sample.size` reads.
+  # Samples with fewer reads than `sample.size` are pruned first; OTUs left
+  # with no counts afterwards are pruned when `trimOTUs` is TRUE. Progress
+  # notes are emitted with message() when `verbose` is TRUE.
+
+  if( as(rngseed, "logical") ){
+    # A seed was supplied: fix the RNG state with it.
+    set.seed(rngseed)
+    if(verbose){
+      message("`set.seed(", rngseed, ")` was used to initialize repeatable random subsampling.")
+      message("Please record this for your records so others can reproduce.")
+      message("Try `set.seed(", rngseed,"); .Random.seed` for the full vector")
+      message("...")
+    }
+  } else if(verbose){
+    message("You set `rngseed` to FALSE. Make sure you've set & recorded\n",
+            " the random seed of your session for reproducibility.\n",
+            "See `?set.seed`\n")
+    message("...")
+  }
+
+  # Make sure sample.size is of length 1.
+  if( length(sample.size) > 1 ){
+    warning("`sample.size` had more than one value. ",
+            "Using only the first. \n ... \n")
+    sample.size <- sample.size[1]
+  }
+
+  if( sample.size <= 0 ){
+    stop("sample.size less than or equal to zero. ",
+         "Need positive sample size to work.")
+  }
+
+  # Prune samples that have fewer reads than `sample.size`.
+  if( min(sample_sums(physeq)) < sample.size ){
+    rmsamples <- sample_names(physeq)[sample_sums(physeq) < sample.size]
+    if(verbose){
+      message(length(rmsamples), " samples removed ",
+              "because they contained fewer reads than `sample.size`.")
+      message("Up to first five removed samples are: \n")
+      # message() has no `sep` argument, so join the names explicitly.
+      shown <- rmsamples[seq_len(min(5, length(rmsamples)))]
+      message(paste(shown, collapse="\t"))
+      message("...")
+    }
+    physeq <- prune_samples(setdiff(sample_names(physeq), rmsamples), physeq)
+  }
+
+  # Work on a copy oriented with taxa as rows, so that each column passed to
+  # rarefaction_subsample() is one sample.
+  newsub <- physeq
+  if(!taxa_are_rows(newsub)){newsub <- t(newsub)}
+  newotu <- apply(otu_table(newsub), 2, rarefaction_subsample,
+                  sample.size=sample.size, replace=replace)
+  # Add OTU names to the row indices
+  rownames(newotu) <- taxa_names(physeq)
+  # replace the otu_table.
+  otu_table(newsub) <- otu_table(newotu, TRUE)
+
+  if(trimOTUs){
+    # Notify about, then remove, OTUs whose total count is now zero.
+    rmtaxa <- taxa_names(newsub)[taxa_sums(newsub) <= 0]
+    if( length(rmtaxa) > 0 ){
+      if(verbose){
+        message(length(rmtaxa), " OTUs were removed because they are no longer \n",
+                "present in any sample after random subsampling\n")
+        message("...")
+      }
+      newsub <- prune_taxa(setdiff(taxa_names(newsub), rmtaxa), newsub)
+    }
+  }
+  # If the OTU table was transposed before rarefaction, transpose it
+  # back to the way it was in the original physeq object.
+  if(!taxa_are_rows(physeq)){newsub <- t(newsub)}
+  return(newsub)
+}
+################################################################################
+# rarefaction subsample function, one sample
+################################################################################
+#' @keywords internal
+rarefaction_subsample <- function(x, sample.size, replace=FALSE){
+  # Draw `sample.size` observations from the count vector `x` (one entry per
+  # OTU) and return the resampled counts as a numeric vector of length(x).
+  #
+  # x           Vector of counts, one per OTU.
+  # sample.size Number of observations to draw.
+  # replace     TRUE: draw OTU indices with probability proportional to `x`.
+  #             FALSE: draw from the individual observations, so a result can
+  #             never exceed the corresponding entry of `x`.
+
+  # Result vector; stays all-zero for OTUs that are never drawn.
+  rarvec <- numeric(length(x))
+  if(sum(x) <= 0){
+    # Nothing to sample from: return the empty count vector right away.
+    return(rarvec)
+  }
+  if(replace){
+    # resample with replacement
+    suppressWarnings(subsample <- sample.int(length(x), sample.size, replace=TRUE, prob=x))
+  } else {
+    # resample without replacement: one element per individual observation,
+    # holding the index of the OTU it belongs to.
+    obsvec <- rep(seq_along(x), times=x)
+    # Index through sample.int() rather than calling sample(obsvec, ...):
+    # sample() reinterprets a length-one numeric `obsvec` as 1:obsvec, which
+    # could assign the single observation to the wrong OTU.
+    suppressWarnings(subsample <- obsvec[sample.int(length(obsvec), sample.size, replace=FALSE)])
+  }
+  # Tabulate the drawn OTU indices; the table names are those indices.
+  sstab <- table(subsample)
+  # Assign the tabulated random subsample values to the species vector
+  rarvec[as.integer(names(sstab))] <- sstab
+  # Return abundance vector. Let replacement happen elsewhere.
+  return(rarvec)
+}
+################################################################################
+#' Agglomerate closely-related taxa using single-linkage clustering.
+#'
+#' All tips of the tree separated by a cophenetic distance smaller than
+#' \code{h} will be agglomerated into one taxa using \code{\link{merge_taxa}}.
+#'
+#' Can be used to create a non-trivial OTU Table, if a phylogenetic tree is available.
+#'
+#' For now, a simple, ``greedy'', single-linkage clustering is used. In future releases
+#' it should be possible to specify different clustering approaches available in \code{R},
+#' in particular, complete-linkage clustering appears to be used more commonly for OTU
+#' clustering applications.
+#'
+#' @param physeq (Required). A \code{\link{phyloseq-class}},
+#' containing a phylogenetic tree.
+#' Alternatively, a phylogenetic tree \code{\link[ape]{phylo}} will also work.
+#'
+#' @param h (Optional). Numeric scalar of the height where the tree should be cut.
+#' This refers to the tree resulting from hierarchical clustering
+#' of \code{\link[ape]{cophenetic.phylo}(phy_tree(physeq))},
+#' not necessarily the original phylogenetic tree, \code{phy_tree(physeq)}.
+#' Default value is \code{0.2}.
+#' Note that this argument used to be named \code{speciationMinLength},
+#' before this function/method was rewritten.
+#'
+#' @param hcfun (Optional). A function.
+#' The (agglomerative, hierarchical) clustering function to use.
+#' Good examples are
+#' \code{\link[cluster]{agnes}} and \code{\link[stats]{hclust}}.
+#' The default is \code{\link[cluster]{agnes}}.
+#'
+#' @param ... (Optional). Additional named arguments to pass
+#' to \code{hcfun}.
+#'
+#' @return An instance of the \code{\link{phyloseq-class}}.
+#' Or alternatively, a \code{\link{phylo}} object if the
+#' \code{physeq} argument was just a tree.
+#' In the expected-use case, the number of OTUs will be fewer
+#' (see \code{\link{ntaxa}}),
+#' after merging OTUs that are related enough to be called
+#' the same OTU.
+#'
+#' @seealso
+#'
+#' \code{\link{merge_taxa}}
+#'
+#' \code{\link[cluster]{agnes}}
+#'
+#' \code{\link[stats]{hclust}}
+#'
+#' \code{\link[ape]{cophenetic.phylo}}
+#'
+#' \code{\link[ape]{phylo}}
+#'
+#' @importFrom cluster agnes
+#'
+#' @export
+#'
+#' @examples
+#' data("esophagus")
+#' # for speed
+#' esophagus = prune_taxa(taxa_names(esophagus)[1:25], esophagus)
+#' plot_tree(esophagus, label.tips="taxa_names", size="abundance", title="Before tip_glom()")
+#' plot_tree(tip_glom(esophagus, h=0.2), label.tips="taxa_names", size="abundance", title="After tip_glom()")
+tip_glom <- function(physeq, h=0.2, hcfun=agnes, ...){
+  # Cluster the tips on their cophenetic distances with `hcfun`, cut the
+  # resulting dendrogram at height `h`, and merge each multi-member cluster.
+  tipdist <- as.dist(cophenetic.phylo(phy_tree(physeq)))
+  dendro <- as.hclust(hcfun(tipdist, ...))
+  membership <- cutree(dendro, h=h)
+  # Cluster labels that have more than one member.
+  groups <- factor(membership)
+  has_many <- tapply(membership, groups, function(members){length(members) > 1})
+  cliques <- levels(groups)[has_many]
+  # Merge the taxa of each such cluster, one cluster at a time.
+  for( clique in cliques ){
+    physeq <- merge_taxa(physeq, eqtaxa=names(membership)[membership == clique])
+  }
+  return(physeq)
+}
+################################################################################
+################################################################################
+#' Agglomerate taxa of the same type.
+#'
+#' This method merges species that have the same taxonomy at a certain
+#' taxonomic rank.
+#' Its approach is analogous to \code{\link{tip_glom}}, but uses categorical data
+#' instead of a tree. In principle, other categorical data known for all taxa
+#' could also be used in place of taxonomy,
+#' but for the moment, this must be stored in the \code{taxonomyTable}
+#' of the data. Also, columns/ranks to the right of the rank chosen to use
+#' for agglomeration will be replaced with \code{NA},
+#' because they should be meaningless following agglomeration.
+#'
+#' @usage tax_glom(physeq, taxrank=rank_names(physeq)[1], NArm=TRUE, bad_empty=c(NA, "", " ", "\t"))
+#'
+#' @param physeq (Required). \code{\link{phyloseq-class}} or \code{\link{otu_table}}.
+#'
+#' @param taxrank A character string specifying the taxonomic level
+#' that you want to agglomerate over.
+#' Should be among the results of \code{rank_names(physeq)}.
+#' The default value is \code{rank_names(physeq)[1]},
+#' which may agglomerate too broadly for a given experiment.
+#' You are strongly encouraged to try different values for this argument.
+#'
+#' @param NArm (Optional). Logical, length equal to one. Default is \code{TRUE}.
+#' CAUTION. The decision to prune (or not) taxa for which you lack categorical
+#' data could have a large effect on downstream analysis. You may want to
+#' re-compute your analysis under both conditions, or at least think carefully
+#' about what the effect might be and the reasons explaining the absence of
+#' information for certain taxa. In the case of taxonomy, it is often a result
+#' of imprecision in taxonomic designation based on short phylogenetic sequences
+#' and a patchy system of nomenclature. If this seems to be an issue for your
+#' analysis, think about also trying the nomenclature-agnostic \code{\link{tip_glom}}
+#' method if you have a phylogenetic tree available.
+#'
+#' @param bad_empty (Optional). Character vector. Default: \code{c(NA, "", " ", "\t")}.
+#' Defines the bad/empty values
+#' that should be ignored and/or considered unknown. They will be removed
+#' from the internal agglomeration vector derived from the argument to \code{tax},
+#' and therefore agglomeration will not combine taxa according to the presence
+#' of these values in \code{tax}. Furthermore, the corresponding taxa can be
+#' optionally pruned from the output if \code{NArm} is set to \code{TRUE}.
+#'
+#' @return A taxonomically-agglomerated, optionally-pruned, object with class matching
+#' the class of \code{physeq}.
+#'
+#' @seealso
+#' \code{\link{tip_glom}}
+#'
+#' \code{\link{prune_taxa}}
+#'
+#' \code{\link{merge_taxa}}
+#'
+#' @export
+#'
+#' @examples
+#' # data(GlobalPatterns)
+#' # ## print the available taxonomic ranks
+#' # colnames(tax_table(GlobalPatterns))
+#' # ## agglomerate at the Family taxonomic rank
+#' # (x1 <- tax_glom(GlobalPatterns, taxrank="Family") )
+#' # ## How many taxa before/after agglomeration?
+#' # ntaxa(GlobalPatterns); ntaxa(x1)
+#' # ## Look at enterotype dataset...
+#' # data(enterotype)
+#' # ## print the available taxonomic ranks. Shows only 1 rank available, not useful for tax_glom
+#' # colnames(tax_table(enterotype))
+tax_glom <- function(physeq, taxrank=rank_names(physeq)[1],
+                     NArm=TRUE, bad_empty=c(NA, "", " ", "\t")){
+
+  # A taxonomy table is required.
+  if( is.null(access(physeq, "tax_table")) ){
+    stop("The tax_glom() function requires that physeq contain a taxonomyTable")
+  }
+
+  # Only the first element of `taxrank` is used; it must name a known rank.
+  if( !taxrank[1] %in% rank_names(physeq) ){
+    stop("Bad taxrank argument. Must be among the values of rank_names(physeq)")
+  }
+
+  # Column position of the requested rank, and that column's values.
+  rank_col <- which( rank_names(physeq) %in% taxrank[1] )
+  rank_values <- as(access(physeq, "tax_table"), "matrix")[, rank_col]
+
+  # With NArm, drop taxa whose value at this rank is one of `bad_empty`.
+  if( NArm ){
+    is_known <- !(rank_values %in% bad_empty)
+    physeq <- prune_taxa(names(rank_values)[is_known], physeq)
+  }
+
+  # Agglomeration key per taxon: every rank up to and including the requested
+  # one, pasted together with ";_;".
+  lineage_mat <- as(access(physeq, "tax_table"), "matrix")[, 1:rank_col, drop=FALSE]
+  lineage <- apply(lineage_mat, 1, function(ranks){paste(ranks, sep=";_;", collapse=";_;")})
+
+  # Drop keys that are themselves among `bad_empty`; the taxa stay in physeq.
+  lineage <- lineage[ !(lineage %in% bad_empty) ]
+
+  # Taxa names grouped by identical key.
+  spCliques <- tapply(names(lineage), factor(lineage), list)
+
+  # Merge each group in turn.
+  for( clique in names(spCliques) ){
+    physeq <- merge_taxa(physeq, spCliques[[clique]])
+  }
+
+  # Blank out the ranks to the right of the agglomeration rank.
+  n_ranks <- length(rank_names(physeq))
+  if( rank_col < n_ranks ){
+    tax_table(physeq)[, (rank_col+1):n_ranks] <- NA_character_
+  }
+
+  return(physeq)
+}
+################################################################################
+################################################################################
+#' Prune unwanted OTUs / taxa from a phylogenetic object.
+#'
+#' An S4 Generic method for removing (pruning) unwanted OTUs/taxa from phylogenetic
+#' objects, including phylo-class trees, as well as native phyloseq package
+#' objects. This is particularly useful for pruning a phyloseq object that has
+#' more than one component that describes OTUs.
+#' Credit: the \code{phylo}-class version is adapted from
+#' \href{http://cran.at.r-project.org/web/packages/picante/index.html}{prune.sample}.
+#'
+#' @usage prune_taxa(taxa, x)
+#'
+#' @param taxa (Required). A character vector of the taxa in object x that you want to
+#' keep -- OR alternatively -- a logical vector where the kept taxa are TRUE, and length
+#' is equal to the number of taxa in object x. If \code{taxa} is a named
+#' logical, the taxa retained are based on those names. Make sure they are
+#' compatible with the \code{taxa_names} of the object you are modifying (\code{x}).
+#'
+#' @param x (Required). A phylogenetic object, including \code{phylo} trees,
+#' as well as all phyloseq classes that represent taxa. If the function
+#' \code{\link{taxa_names}} returns a non-\code{NULL} value, then your object
+#' can be pruned by this function.
+#'
+#' @return The class of the object returned by \code{prune_taxa} matches
+#' the class of the argument, \code{x}.
+#'
+#' @seealso
+#'
+#' \code{\link{prune_samples}}
+#'
+#' \href{http://cran.at.r-project.org/web/packages/picante/index.html}{prune.sample}
+#'
+#' @rdname prune_taxa-methods
+#' @export
+#' @examples
+#' data("esophagus")
+#' esophagus
+#' plot(sort(taxa_sums(esophagus), TRUE), type="h", ylim=c(0, 50))
+#' x1 = prune_taxa(taxa_sums(esophagus) > 10, esophagus)
+#' x2 = prune_taxa(names(sort(taxa_sums(esophagus), TRUE))[1:9], esophagus)
+#' identical(x1, x2)
+setGeneric(
+  "prune_taxa",
+  function(taxa, x) standardGeneric("prune_taxa")
+)
+#' @aliases prune_taxa,NULL,ANY-method
+#' @rdname prune_taxa-methods
+setMethod("prune_taxa", signature("NULL", "ANY"), function(taxa, x){
+  # With NULL given for `taxa`, `x` is handed back unchanged.
+  x
+})
+# Any prune_taxa call w/ signature starting with a logical
+# converts the logical to a character vector, and then dispatches
+# to more specific method.
+#' @aliases prune_taxa,logical,ANY-method
+#' @rdname prune_taxa-methods
+setMethod("prune_taxa", signature("logical", "ANY"), function(taxa, x){
+  # The logical mask must have exactly one entry per taxon in x.
+  if( !identical(length(taxa), ntaxa(x)) ){
+    stop("logical argument to taxa is wrong length. Should equal ntaxa(x)")
+  }
+  # Turn the mask into taxa names and re-dispatch on the character signature.
+  kept_names <- taxa_names(x)[taxa]
+  prune_taxa(kept_names, x)
+})
+#' @importFrom ape drop.tip
+#' @aliases prune_taxa,character,phylo-method
+#' @rdname prune_taxa-methods
+setMethod("prune_taxa", signature("character", "phylo"), function(taxa, x){
+  # One or zero requested tips: warn and return NULL in place of a tree.
+  if( length(taxa) <= 1 ){
+    warning("prune_taxa attempted to reduce tree to 1 or fewer tips.\n tree replaced with NULL.")
+    return(NULL)
+  }
+  # Requested tips are exactly the tree's tips: nothing to drop.
+  if( setequal(taxa, taxa_names(x)) ){
+    return(x)
+  }
+  # Otherwise drop every tip that was not requested.
+  unwanted <- setdiff(taxa_names(x), taxa)
+  drop.tip(x, unwanted)
+})
+#' @aliases prune_taxa,character,otu_table-method
+#' @rdname prune_taxa-methods
+setMethod("prune_taxa", signature("character", "otu_table"), function(taxa, x){
+  # Requested taxa are exactly those already present: return x as-is.
+  if( setequal(taxa, taxa_names(x)) ){
+    return(x)
+  }
+  # Keep only the requested taxa that exist in x, indexing along whichever
+  # dimension holds the taxa.
+  keep <- intersect( taxa, taxa_names(x) )
+  if( taxa_are_rows(x) ) x[keep, , drop=FALSE] else x[, keep, drop=FALSE]
+})
+#' @aliases prune_taxa,character,sample_data-method
+#' @rdname prune_taxa-methods
+setMethod("prune_taxa", signature("character", "sample_data"), function(taxa, x){
+  # This method leaves sample_data untouched and returns it unchanged.
+  x
+})
+#' @aliases prune_taxa,character,phyloseq-method
+#' @rdname prune_taxa-methods
+setMethod("prune_taxa", signature("character", "phyloseq"), function(taxa, x){
+  # Prune every taxa-describing component of a phyloseq object to one common
+  # set of taxa, then re-establish a consistent taxa order.
+  #
+  # Slots are accessed with R's `@` operator (restored here from the
+  # " at " that e-mail address obfuscation had substituted for it).
+
+  # Re-define `taxa` as the intersection of OTU names for each component AND `taxa`
+  taxa <- intersect(intersect_taxa(x), taxa)
+  # All phyloseq objects have an otu_table slot, no need to test for existence.
+  x@otu_table <- prune_taxa(taxa, otu_table(x))
+  # Test if slot is present. If so, perform the component prune.
+  if( !is.null(x@tax_table) ){
+    x@tax_table <- prune_taxa(taxa, tax_table(x))
+  }
+  if( !is.null(x@phy_tree) ){
+    x@phy_tree <- prune_taxa(taxa, phy_tree(x))
+  }
+  if( !is.null(x@refseq) ){
+    x@refseq <- prune_taxa(taxa, refseq(x))
+  }
+  # Force index order after pruning to be the same,
+  # according to the same rules as in the constructor, phyloseq()
+  x <- index_reorder(x, index_type="taxa")
+  return(x)
+})
+#' @aliases prune_taxa,character,taxonomyTable-method
+#' @rdname prune_taxa-methods
+setMethod("prune_taxa", signature("character", "taxonomyTable"), function(taxa, x){
+  present <- taxa_names(x)
+  # Identical sets of names: return the table as-is.
+  if( setequal(taxa, present) ){
+    return(x)
+  }
+  # Keep the rows whose names were requested and exist in the table.
+  keep <- intersect(taxa, present)
+  return( x[keep, , drop=FALSE] )
+})
+#' @importClassesFrom Biostrings XStringSet
+#' @aliases prune_taxa,character,XStringSet-method
+#' @rdname prune_taxa-methods
+setMethod("prune_taxa", signature("character", "XStringSet"), function(taxa, x){
+  # Logical mask over the sequences of `x`: TRUE where the name is requested.
+  keep <- taxa_names(x) %in% taxa
+  if( setequal(taxa, taxa_names(x)) ){
+    # Nothing to do, return x as-is.
+    return(x)
+  } else if( !any(keep) ){
+    # Informative error if intersection is zero.
+    stop("prune_taxa,XStringSet: taxa and taxa_names(x) do not overlap.")
+  } else {
+    # Keep the requested OTUs without reordering.
+    # A logical mask is used instead of `x[-which(!keep)]`: when `taxa` is a
+    # strict superset of the names, which() is empty and a negative empty
+    # index selects nothing rather than everything.
+    return(x[keep])
+  }
+})
+################################################################################
+################################################################################
+#' Define a subset of samples to keep in a phyloseq object.
+#'
+#' An S4 Generic method for pruning/filtering unwanted samples
+#' by defining those you want to keep.
+#'
+#' @usage prune_samples(samples, x)
+#'
+#' @param samples (Required). A character vector of the samples in object x that you want to
+#' keep -- OR alternatively -- a logical vector where the kept samples are TRUE, and length
+#' is equal to the number of samples in object x. A logical vector is applied
+#' by position to \code{sample_names(x)}; any names it carries are not consulted.
+#' Make sure its order is compatible with the \code{sample_names} of the
+#' object you are modifying (\code{x}).
+#'
+#' @param x A phyloseq object.
+#'
+#' @return The class of the object returned by \code{prune_samples} matches
+#' the class of the phyloseq object, \code{x}.
+#'
+#' @seealso \code{\link{subset_samples}}
+#'
+#' @rdname prune_samples-methods
+#' @docType methods
+#' @export
+#' @examples
+#' data(GlobalPatterns)
+#' # Subset to just the Chlamydiae phylum.
+#' GP.chl <- subset_taxa(GlobalPatterns, Phylum=="Chlamydiae")
+#' # Remove the samples that have less than 20 total reads from Chlamydiae
+#' GP.chl <- prune_samples(sample_sums(GP.chl)>=20, GP.chl)
+#' # (p <- plot_tree(GP.chl, color="SampleType", shape="Family", label.tips="Genus", size="abundance"))
+setGeneric("prune_samples", function(samples, x) standardGeneric("prune_samples"))
+#' @aliases prune_samples,character,otu_table-method
+#' @rdname prune_samples-methods
+setMethod("prune_samples", signature("character", "otu_table"), function(samples, x){
+  if( setequal(samples, sample_names(x)) ){
+    # If the sets of `samples` and sample_names are the same, return as-is.
+    return(x)
+  } else {
+    # Keep only requested names that actually occur in the table.
+    samples = intersect(samples, sample_names(x))
+    # drop=FALSE is passed explicitly, matching the prune_taxa method for
+    # otu_table, so the request to keep both dimensions is visible here.
+    # NOTE(review): whether the otu_table `[` method would otherwise drop a
+    # dimension for a single sample is defined elsewhere -- confirm.
+    if( taxa_are_rows(x) ){
+      return( x[, samples, drop=FALSE] )
+    } else {
+      return( x[samples, , drop=FALSE] )
+    }
+  }
+})
+#' @aliases prune_samples,character,sample_data-method
+#' @rdname prune_samples-methods
+setMethod("prune_samples", signature("character", "sample_data"), function(samples, x){
+  present <- sample_names(x)
+  # Identical sets of sample names: return the table as-is.
+  if( setequal(samples, present) ){
+    return(x)
+  }
+  # Keep the rows whose names were requested and exist in the table.
+  keep <- intersect(samples, present)
+  return(x[keep, ])
+})
+#' @aliases prune_samples,character,phyloseq-method
+#' @rdname prune_samples-methods
+setMethod("prune_samples", signature("character", "phyloseq"), function(samples, x){
+  # Prune every sample-indexed component of a phyloseq object to `samples`.
+  # Slots are accessed with the S4 `@` operator.
+  #
+  # Re-define `samples` as the intersection of samples names for each component AND `samples`
+  samples = intersect(intersect_samples(x), samples)
+  # Now prune each component.
+  # All phyloseq objects have an otu_table slot, no need to test for existence.
+  x@otu_table = prune_samples(samples, otu_table(x))
+  if( !is.null(x@sam_data) ){
+    # protect missing sample_data component. Don't need to prune if empty
+    x@sam_data = prune_samples(samples, sample_data(x))
+  }
+  # Force sample index order after pruning to be the same,
+  # according to the same rules as in the constructor, phyloseq()
+  x = index_reorder(x, index_type="samples")
+  return(x)
+})
+# A logical should specify the samples to keep, or not. Have same length as nsamples(x)
+#' @aliases prune_samples,logical,ANY-method
+#' @rdname prune_samples-methods
+setMethod("prune_samples", signature("logical", "ANY"), function(samples, x){
+  # The logical mask must carry exactly one entry per sample of `x`.
+  if( identical(length(samples), nsamples(x)) ){
+    # Turn the mask into the sample names it selects (by position), then
+    # re-dispatch to the character-based method for the class of `x`.
+    selected_names <- sample_names(x)[samples]
+    return( prune_samples(selected_names, x) )
+  }
+  stop("logical argument to samples is wrong length. Should equal nsamples(x)")
+})
+################################################################################
+#' Thresholded rank transformation.
+#'
+#' The lowest \code{thresh} values in \code{x} all get the value 'thresh'.
+#'
+#' @usage threshrank(x, thresh, keep0s=FALSE, ...)
+#'
+#' @param x (Required). Numeric vector to transform.
+#' @param thresh A single numeric value giving the threshold.
+#' @param keep0s A logical determining whether 0's in \code{x} should remain
+#' a zero-value in the output. If FALSE, zeros are treated as any other value.
+#' @param ... Further arguments passed to the \code{\link{rank}} function.
+#'
+#' @return A ranked, (optionally) thresholded numeric vector with length equal to
+#' \code{x}. Default arguments to \code{rank} are used, unless provided as
+#' additional arguments.
+#'
+#' @seealso \code{\link{transform_sample_counts}}, \code{\link{rank}}, \code{\link{threshrankfun}}
+#' @export
+#' @examples #
+#' (a_vector <- sample(0:10, 100, TRUE))
+#' threshrank(a_vector, 5, keep0s=TRUE)
+#' data(GlobalPatterns)
+#' GP <- GlobalPatterns
+#' ## These three approaches result in identical otu_table
+#' (x1 <- transform_sample_counts( otu_table(GP), threshrankfun(500)) )
+#' (x2 <- otu_table(apply(otu_table(GP), 2, threshrankfun(500)), taxa_are_rows(GP)) )
+#' identical(x1, x2)
+#' (x3 <- otu_table(apply(otu_table(GP), 2, threshrank, thresh=500), taxa_are_rows(GP)) )
+#' identical(x1, x3)
+threshrank <- function(x, thresh, keep0s=FALSE, ...){
+  # Remember where the zeros are before `x` is replaced by its ranks.
+  zero_pos <- NULL
+  if( keep0s ){ zero_pos <- which(x == 0) }
+  ranks <- rank(x, ...)
+  # Only the first element of `thresh` is used.
+  floor_rank <- thresh[1]
+  # Every rank below the threshold is raised to the threshold.
+  below <- ranks < floor_rank
+  ranks[below] <- floor_rank
+  # Optionally put the original zeros back as zeros.
+  if( keep0s ){ ranks[zero_pos] <- 0 }
+  ranks
+}
+####################################################################################
+#' A closure version of the \code{threshrank} function.
+#'
+#' Takes the same arguments as \code{\link{threshrank}}, except for \code{x},
+#' because the output is a single-argument function rather than a rank-transformed numeric.
+#' This is useful for higher-order functions that require a single-argument function as input,
+#' like \code{\link{transform_sample_counts}}.
+#'
+#' @usage threshrankfun(thresh, keep0s=FALSE, ...)
+#'
+#' @param thresh A single numeric value giving the threshold.
+#' @param keep0s A logical determining whether 0's in \code{x} should remain
+#' a zero-value in the output. If FALSE, zeros are treated as any other value.
+#' @param ... Further arguments passed to the \code{\link{rank}} function.
+#'
+#' @return A single-argument function with the options to \code{\link{threshrank}} set.
+#'
+#' @seealso \code{\link{transform_sample_counts}}, \code{\link{threshrankfun}},
+#' \code{\link{threshrank}}
+#' @export
+#' @examples
+#' data(esophagus)
+#' x1 = transform_sample_counts(esophagus, threshrankfun(50))
+#' otu_table(x1)
+#' x2 = transform_sample_counts(esophagus, rank)
+#' otu_table(x2)
+#' identical(x1, x2)
+threshrankfun <- function(thresh, keep0s=FALSE, ...){
+  # Build a single-argument function with the threshrank() options baked in.
+  # `keep0s` is forwarded as given by the caller; it was previously
+  # hard-coded to FALSE, so keep0s=TRUE had no effect.
+  function(x){
+    threshrank(x, thresh, keep0s=keep0s, ...)
+  }
+}
+################################################################################
+#' Transpose \code{\link{otu_table-class}} or \code{\link{phyloseq-class}}
+#'
+#' Extends the base transpose method, \code{\link[base]{t}}.
+#'
+#' @usage t(x)
+#'
+#' @param x An \code{otu_table} or \code{\link{phyloseq-class}}.
+#'
+#' @return The class of the object returned by \code{t} matches
+#' the class of the argument, \code{x}. The \code{otu_table} is
+#' transposed, and \code{\link{taxa_are_rows}} value is toggled.
+#'
+#' @name t
+#' @rdname transpose-methods
+#' @docType methods
+#' @export
+#' @examples
+#' data(GlobalPatterns)
+#' otu_table(GlobalPatterns)
+#' t( otu_table(GlobalPatterns) )
+setGeneric("t")
+#' @aliases t,otu_table-method
+#' @rdname transpose-methods
+setMethod("t", signature("otu_table"), function(x){
+  # Transpose the plain-matrix form of the table, then rebuild the otu_table
+  # with the orientation flag flipped.
+  flipped <- t(as(x, "matrix"))
+  otu_table(flipped, taxa_are_rows=(!taxa_are_rows(x)))
+})
+################################################################################
+#' @aliases t,phyloseq-method
+#' @rdname transpose-methods
+setMethod("t", signature("phyloseq"), function(x){
+  # Only the otu_table component is transposed; the slot is written with the
+  # S4 `@` operator and every other component is left as it was.
+  x@otu_table <- t( otu_table(x) )
+  return(x)
+})
+################################################################################
+#' Transform abundance data in an \code{otu_table}, sample-by-sample.
+#'
+#' This function transforms the sample counts of a taxa
+#' abundance matrix according to a user-provided function.
+#' The counts of each sample will be transformed individually. No sample-sample
+#' interaction/comparison is possible by this method.
+#'
+#' @usage transform_sample_counts(physeq, fun, ...)
+#'
+#' @param physeq (Required). \code{\link{phyloseq-class}} or \code{\link{otu_table-class}}.
+#'
+#' @param fun (Required). A single-argument function that will be applied
+#' to the abundance counts of each sample.
+#' Can be an anonymous \code{\link[base]{function}}.
+#'
+#' @param ... (Optional). Additional, optionally-named, arguments passed to
+#' \code{fun} during transformation of abundance data.
+#'
+#' @return A transformed \code{otu_table} -- or \code{phyloseq} object with its
+#' transformed \code{otu_table}.
+#' In general, trimming is not expected by this
+#' method, so it is suggested that the user provide only functions that return
+#' a full-length vector. Filtering/trimming can follow, for which the
+#' \code{\link{genefilter_sample}} and \code{\link{prune_taxa}} functions
+#' are suggested.
+#'
+#' @seealso \code{\link{threshrankfun}}, \code{\link{rank}}, \code{\link{log}}
+#'
+#' @docType methods
+#' @aliases transform_sample_counts transformSampleCounts
+#' @rdname transformcounts
+#' @export
+#'
+#' @examples #
+#' data(esophagus)
+#' x1 = transform_sample_counts(esophagus, threshrankfun(50))
+#' head(otu_table(x1), 10)
+#' x2 = transform_sample_counts(esophagus, rank)
+#' head(otu_table(x2), 10)
+#' identical(x1, x2)
+#' x3 = otu_table(esophagus) + 5
+#' x3 = transform_sample_counts(x3, log)
+#' head(otu_table(x3), 10)
+#' x4 = transform_sample_counts(esophagus, function(x) round(x^2.2, 0))
+#' head(otu_table(x4), 10)
+transform_sample_counts <- function(physeq, fun, ...){
+  # Test the user-provided function returns a vector of the same length as input.
+  # The bare call fun(1:10) is tried first, exactly as before. Only if that
+  # call errors (e.g. `fun` has a required extra argument supplied through
+  # `...`) is the probe retried with `...` forwarded.
+  probe <- tryCatch(fun(1:10), error=function(e){ fun(1:10, ...) })
+  if( !identical(length(probe), 10L) ){stop("`fun` not valid function.")}
+  taxa_rows <- taxa_are_rows(physeq)
+  counts <- as(otu_table(physeq), "matrix")
+  # apply() below walks over columns, so samples must be the columns.
+  if( !taxa_rows ){ counts <- t(counts) }
+  newphyseq <- apply(counts, 2, fun, ...)
+  if( identical(ntaxa(physeq), 1L) ){
+    # Fix the dropped index when only 1 OTU.
+    newphyseq <- matrix(newphyseq, 1L, nsamples(physeq), TRUE,
+                        list(taxa_names(physeq), sample_names(physeq)))
+  }
+  # Put the result back into the orientation of the input table.
+  if( !taxa_rows ){ newphyseq <- t(newphyseq) }
+  # Check that original and new dimensions agree. Error if not.
+  if( !identical(dim(newphyseq), dim(otu_table(physeq))) ){
+    stop("Dimensions of OTU table change after apply-ing function. \n",
+         " Please check both function and table")
+  }
+  otu_table(physeq) <- otu_table(newphyseq, taxa_are_rows=taxa_rows)
+  return(physeq)
+}
+####################################################################################
+#' @rdname transformcounts
+#' @export
+transformSampleCounts <- transform_sample_counts
+####################################################################################
+############################################################
+#' Filter OTUs with arbitrary function, sample-wise.
+#'
+#' A general OTU trimming function for selecting OTUs that satisfy
+#' some criteria within the distribution of each sample, and then
+#' also an additional criteria for number of samples that must pass.
+#' This is a genefilter-like function that only considers sample-wise
+#' criteria. The number of acceptable samples is used
+#' as the final criteria (set by the argument \code{A})
+#' to determine whether or not the taxa should
+#' be retained (\code{TRUE}) or not (\code{FALSE}). Just like with genefilter, a
+#' logical having length equal to nrow()/\code{\link{ntaxa}} is returned, indicating which
+#' should be kept. This output can be provided
+#' directly to OTU trimming function, \code{\link{prune_taxa}}.
+#' By contrast, \code{\link[genefilter]{genefilter}},
+#' of the genefilter package in Bioconductor,
+#' works only on the rows of a matrix. Note that, because \code{\link{otu_table-class}}
+#' inherits directly from the \code{\link{matrix-class}}, an unmodified
+#' otu_table can be provided to \code{genefilter}, but be mindful of the orientation
+#' of the otu_table (use \code{\link{taxa_are_rows}}),
+#' and transpose (\code{\link[phyloseq]{t}}) if needed.
+#'
+#' @usage genefilter_sample(X, flist, A=1)
+#'
+#' @param X The object that needs trimming. Can be matrix, otu_table, or higher-
+#' order phyloseq classes that contain an otu_table.
+#'
+#' @param flist An enclosure object, typically created with \code{\link{filterfun_sample}}
+#'
+#' @param A An integer. The number of samples in which a taxa / OTUs passed the filter
+#' for it to be labeled TRUE in the output logical vector.
+#'
+#' @return A logical vector with names equal to taxa_names (or rownames, if matrix).
+#'
+#' @seealso \code{\link[genefilter]{genefilter}}, \code{\link{filterfun_sample}},
+#' \code{\link[phyloseq]{t}},
+#' \code{\link{prune_taxa}}
+#' @keywords agglomerate OTU cluster tree
+#'
+#' @rdname genefilter_sample-methods
+#' @docType methods
+#' @export
+#'
+#' @examples #
+#' ## testOTU <- otu_table(matrix(sample(1:50, 25, replace=TRUE), 5, 5), taxa_are_rows=FALSE)
+#' ## f1 <- filterfun_sample(topk(2))
+#' ## wh1 <- genefilter_sample(testOTU, f1, A=2)
+#' ## wh2 <- c(TRUE, TRUE, TRUE, FALSE, FALSE)
+#' ## prune_taxa(wh1, testOTU)
+#' ## prune_taxa(wh2, testOTU)
+#' ##
+#' ## tax_table1 <- tax_table(matrix("abc", 5, 5))
+#' ## prune_taxa(wh1, tax_table1)
+#' ## prune_taxa(wh2, tax_table1)
+setGeneric("genefilter_sample", function(X, flist, A=1) standardGeneric("genefilter_sample"))
+#' @rdname genefilter_sample-methods
+#' @aliases genefilter_sample,matrix-method
+setMethod("genefilter_sample", signature("matrix"), function(X, flist, A=1){
+  # Apply the filter to each column (sample); rows are the taxa.
+  TFmat = apply(X, 2, flist)
+  # With a single-row matrix apply() returns a plain vector (one value per
+  # column) instead of a matrix; rebuild the 1-row matrix so the row-wise
+  # count below still works.
+  if( is.null(dim(TFmat)) && is.atomic(TFmat) &&
+      identical(nrow(X), 1L) && length(TFmat) == ncol(X) ){
+    TFmat = matrix(TFmat, nrow=1L, dimnames=dimnames(X))
+  }
+  # A taxon passes when at least `A` samples passed the filter.
+  apply(TFmat, 1, function(x, A){sum(x) >= A}, A)
+})
+#' @rdname genefilter_sample-methods
+#' @aliases genefilter_sample,otu_table-method
+setMethod("genefilter_sample", signature("otu_table"), function(X, flist, A=1){
+  # The matrix method is given taxa as rows; transpose first when the
+  # table stores taxa as columns.
+  counts <- as(X, "matrix")
+  if( !taxa_are_rows(X) ){
+    counts <- t(counts)
+  }
+  genefilter_sample(counts, flist, A)
+})
+#' @rdname genefilter_sample-methods
+#' @aliases genefilter_sample,phyloseq-method
+setMethod("genefilter_sample", signature("phyloseq"), function(X, flist, A=1){
+  # Delegate to the otu_table method using the object's abundance table.
+  abundance <- otu_table(X)
+  genefilter_sample(abundance, flist, A)
+})
+################################################################################
+#' A sample-wise filter function builder
+#' analogous to \code{\link[genefilter]{filterfun}}.
+#'
+#' See the \code{\link[genefilter]{filterfun}}, from the Bioconductor repository,
+#' for a taxa-/gene-wise filter (and further examples).
+#'
+#' @usage filterfun_sample(...)
+#'
+#' @param ... A comma-separated list of functions.
+#'
+#' @return An enclosure (function) that itself will return a logical vector,
+#' according to the
+#' functions provided in the argument list, evaluated in order. The output of
+#' filterfun_sample is appropriate for the `flist' argument to the
+#' genefilter_sample method.
+#'
+#' @export
+#' @seealso \code{\link[genefilter]{filterfun}}, \code{\link{genefilter_sample}}
+#' @examples
+#' # Use simulated abundance matrix
+#' set.seed(711)
+#' testOTU <- otu_table(matrix(sample(1:50, 25, replace=TRUE), 5, 5), taxa_are_rows=FALSE)
+#' f1 <- filterfun_sample(topk(2))
+#' wh1 <- genefilter_sample(testOTU, f1, A=2)
+#' wh2 <- c(TRUE, TRUE, TRUE, FALSE, FALSE)
+#' prune_taxa(wh1, testOTU)
+#' prune_taxa(wh2, testOTU)
+filterfun_sample = function(...){
+  filters <- list(...)
+  # A single list argument is unwrapped, so the functions may also be
+  # supplied as one list.
+  if( length(filters) == 1 && is.list(filters[[1]]) ){ filters <- filters[[1]] }
+  combined = function(x){
+    # Start from the first filter's result (a logical vector) ...
+    passed = filters[[1]](x)
+    # ... then AND it element-wise with each remaining filter, in order.
+    for( i in seq_along(filters)[-1] ){
+      passed = passed & filters[[i]](x)
+    }
+    return(passed)
+  }
+  class(combined) <- "filterfun"
+  return(combined)
+}
+################################################################################
+#' Filter taxa based on across-sample OTU abundance criteria
+#'
+#' This function is directly analogous to the
+#' \code{\link[genefilter]{genefilter}} function for microarray filtering,
+#' but is used for filtering OTUs from phyloseq objects.
+#' It applies an arbitrary set of functions ---
+#' as a function list, for instance, created by \code{\link[genefilter]{filterfun}} ---
+#' as across-sample criteria, one OTU at a time.
+#' It takes as input a phyloseq object,
+#' and returns a logical vector
+#' indicating whether or not each OTU passed the criteria.
+#' Alternatively, if the \code{"prune"} option is set to \code{TRUE},
+#' it returns the already-trimmed version of the phyloseq object.
+#'
+#' @usage filter_taxa(physeq, flist, prune=FALSE)
+#'
+#' @param physeq (Required). A \code{\link{phyloseq-class}} object that you
+#' want to trim/filter.
+#'
+#' @param flist (Required). A function or list of functions that take a vector
+#' of abundance values and return a logical. Some canned useful function types
+#' are included in the \code{genefilter}-package.
+#'
+#' @param prune (Optional). A logical. Default \code{FALSE}. If \code{TRUE}, then
+#' the function returns the pruned \code{\link{phyloseq-class}} object, rather
+#' than the logical vector of taxa that passed the filter.
+#'
+#' @return A logical vector equal to the number of taxa in \code{physeq}.
+#' This can be provided directly to \code{\link{prune_taxa}} as first argument.
+#' Alternatively, if \code{prune==TRUE}, the pruned \code{\link{phyloseq-class}}
+#' object is returned instead.
+#'
+#' @export
+#' @seealso
+#' \code{\link[genefilter]{filterfun}},
+#' \code{\link{genefilter_sample}},
+#' \code{\link{filterfun_sample}}
+#'
+#' @examples
+#' data("enterotype")
+#' require("genefilter")
+#' flist <- filterfun(kOverA(5, 2e-05))
+#' ent.logi <- filter_taxa(enterotype, flist)
+#' ent.trim <- filter_taxa(enterotype, flist, TRUE)
+#' identical(ent.trim, prune_taxa(ent.logi, enterotype))
+#' identical(sum(ent.logi), ntaxa(ent.trim))
+#' filter_taxa(enterotype, flist, TRUE)
+filter_taxa <- function(physeq, flist, prune=FALSE){
+  # Pull out the abundance table.
+  counts <- access(physeq, "otu_table", TRUE)
+  # Taxa are being filtered, so make sure they are the rows.
+  if( !taxa_are_rows(counts) ){
+    counts <- t(counts)
+  }
+  # Run the filter(s) over each row (taxon) of the plain matrix.
+  passed <- apply(as(counts, "matrix"), 1, flist)
+  # Sanity check: one result per taxon is expected.
+  if( ntaxa(physeq) != length(passed) ){
+    stop("Logic error in applying function(s). Logical result not same length as ntaxa(physeq)")
+  }
+  # Either prune with the result, or hand the logical back.
+  if( prune ){
+    return( prune_taxa(passed, physeq) )
+  }
+  return( passed )
+}
+################################################################################
+#' Make filter fun. that returns the most abundant \code{k} taxa
+#'
+#' @usage topk(k, na.rm=TRUE)
+#'
+#' @param k An integer, indicating how many of the most abundant taxa
+#' should be kept.
+#' @param na.rm A logical. Should \code{NA}s be removed. Default is \code{TRUE}.
+#'
+#' @return Returns a function (enclosure) that will return TRUE
+#' for each element in the most abundant k values.
+#'
+#' @seealso \code{\link{topk}}, \code{\link{topf}},
+#' \code{\link{topp}}, \code{\link{rm_outlierf}}
+#'
+#' @export
+#'
+#' @examples
+#' ## Use simulated abundance matrix
+#' set.seed(711)
+#' testOTU <- otu_table(matrix(sample(1:50, 25, replace=TRUE), 5, 5), taxa_are_rows=FALSE)
+#' f1 <- filterfun_sample(topk(2))
+#' wh1 <- genefilter_sample(testOTU, f1, A=2)
+#' wh2 <- c(TRUE, TRUE, TRUE, FALSE, FALSE)
+#' prune_taxa(wh1, testOTU)
+#' prune_taxa(wh2, testOTU)
+topk = function(k, na.rm=TRUE){
+  function(x){
+    # Optionally discard missing values first (this shortens `x`).
+    if( na.rm ){ x = x[!is.na(x)] }
+    # The k-th largest value is the cutoff; everything at or above it passes.
+    cutoff = sort(x, decreasing=TRUE)[k]
+    x >= cutoff
+  }
+}
+############################################################
+#' Make filter fun. that returns the most abundant \code{p} fraction of taxa
+#'
+#' @usage topp(p, na.rm=TRUE)
+#'
+#' @param p A numeric of length 1, indicating what fraction of the most abundant taxa
+#' should be kept.
+#' @param na.rm A logical. Should \code{NA}s be removed. Default is \code{TRUE}.
+#'
+#' @return A function (enclosure), suitable for \code{\link{filterfun_sample}},
+#' that will return \code{TRUE}
+#' for each element in the most abundant p fraction of taxa.
+#'
+#' @seealso \code{\link{topk}}, \code{\link{topf}},
+#' \code{\link{topp}}, \code{\link{rm_outlierf}}
+#'
+#' @export
+#'
+#' @examples
+#' ## Use simulated abundance matrix
+#' set.seed(711)
+#' testOTU <- otu_table(matrix(sample(1:50, 25, replace=TRUE), 5, 5), taxa_are_rows=FALSE)
+#' sample_sums(testOTU)
+#' f1 <- filterfun_sample(topp(0.2))
+#' (wh1 <- genefilter_sample(testOTU, f1, A=1))
+#' wh2 <- c(TRUE, TRUE, TRUE, FALSE, FALSE)
+#' prune_taxa(wh1, testOTU)
+#' prune_taxa(wh2, testOTU)
+topp <- function(p, na.rm=TRUE){
+  function(x){
+    # Optionally discard missing values first (this shortens `x`).
+    if( na.rm ){ x = x[!is.na(x)] }
+    ranked = sort(x, decreasing=TRUE)
+    # Position of the cutoff: the fraction `p` of the values, rounded up.
+    cutoff_pos = ceiling(length(x)*p)
+    x >= ranked[cutoff_pos]
+  }
+}
+################################################################################
+#' Make filter fun. that returns the top f fraction of taxa in a sample.
+#'
+#' As opposed to \code{\link{topp}}, which gives the
+#' most abundant p fraction of observed taxa (richness, instead of cumulative
+#' abundance). Said another way, topf ensures a certain
+#' fraction of the total sequences are retained, while topp ensures
+#' that a certain fraction of taxa/species/OTUs are retained.
+#'
+#' @usage topf(f, na.rm=TRUE)
+#' @param f Single numeric value between 0 and 1.
+#' @param na.rm Logical. Should we remove NA values. Default \code{TRUE}.
+#'
+#' @return A function (enclosure), suitable for \code{\link{filterfun_sample}},
+#' that will return \code{TRUE}
+#' for each element in the taxa comprising the most abundant f fraction of individuals.
+#'
+#' @seealso \code{\link{topk}}, \code{\link{topf}},
+#' \code{\link{topp}}, \code{\link{rm_outlierf}}
+#'
+#' @export
+#'
+#' @examples
+#' t1 <- 1:10; names(t1)<-paste("t", 1:10, sep="")
+#' topf(0.6)(t1)
+#' ## Use simulated abundance matrix
+#' set.seed(711)
+#' testOTU <- otu_table(matrix(sample(1:50, 25, replace=TRUE), 5, 5), taxa_are_rows=FALSE)
+#' f1 <- filterfun_sample(topf(0.4))
+#' (wh1 <- genefilter_sample(testOTU, f1, A=1))
+#' wh2 <- c(TRUE, TRUE, TRUE, FALSE, FALSE)
+#' prune_taxa(wh1, testOTU)
+#' prune_taxa(wh2, testOTU)
+topf <- function(f, na.rm=TRUE){
+  function(x){
+    # Optionally discard missing values first (this shortens `x`).
+    if (na.rm){
+      x = x[!is.na(x)]
+    }
+    # Positions of the values from most to least abundant.
+    ord <- order(x, decreasing = TRUE)
+    # Cumulative fraction of the total reached at each of those positions.
+    cumfrac <- cumsum(x[ord])/sum(x)
+    # Map the flags back to the original positions by index rather than by
+    # name, so unnamed input (and input with duplicated names) is handled.
+    keep <- logical(length(x))
+    keep[ord] <- cumfrac <= f
+    names(keep) <- names(x)
+    return( keep )
+  }
+}
+################################################################################
+#' Set to FALSE any outlier species greater than f fractional abundance.
+#'
+#' This is for removing overly-abundant outlier taxa, not for trimming low-abundance
+#' taxa.
+#'
+#' @usage rm_outlierf(f, na.rm=TRUE)
+#'
+#' @param f Single numeric value between 0 and 1. The maximum fractional abundance
+#' value that a taxa will be allowed to have in a sample without being marked
+#' for trimming.
+#'
+#' @param na.rm Logical. Should we remove NA values. Default \code{TRUE}.
+#'
+#' @return A function (enclosure), suitable for \code{\link{filterfun_sample}}.
+#'
+#' @seealso \code{\link{topk}}, \code{\link{topf}},
+#' \code{\link{topp}}, \code{\link{rm_outlierf}}
+#'
+#' @export
+#' @examples
+#' t1 <- 1:10; names(t1)<-paste("t", 1:10, sep="")
+#' rm_outlierf(0.15)(t1)
+#' ## Use simulated abundance matrix
+#' set.seed(711)
+#' testOTU <- otu_table(matrix(sample(1:50, 25, replace=TRUE), 5, 5), taxa_are_rows=FALSE)
+#' taxa_sums(testOTU)
+#' f1 <- filterfun_sample(rm_outlierf(0.1))
+#' (wh1 <- genefilter_sample(testOTU, f1, A=1))
+#' wh2 <- c(TRUE, TRUE, TRUE, FALSE, FALSE)
+#' prune_taxa(wh1, testOTU)
+#' prune_taxa(wh2, testOTU)
+rm_outlierf <- function(f, na.rm=TRUE){
+  function(x){
+    # Optionally discard missing values first (this shortens `x`).
+    if( na.rm ){ x = x[!is.na(x)] }
+    # TRUE where the value's share of the total is below `f`.
+    (x / sum(x)) < f
+  }
+}
+################################################################################
diff --git a/R/validity-methods.R b/R/validity-methods.R
new file mode 100644
index 0000000..a29bdc9
--- /dev/null
+++ b/R/validity-methods.R
@@ -0,0 +1,117 @@
+################################################################################
+# Validity methods:
+#
+# These are delicate, because they are effectively at the S4 infrastructure
+# level, in between "new" and the constructor. Some of the issues that might
+# otherwise go here for a check are handled by the constructors. In many
+# cases it desirable to let the constructor handle this, because it allows
+# greater flexibility and transparency. These tests should be limited to
+# conditions that are not fixed automatically by the constructors, and/or
+# could not be because the deficiency/error is too fundamental. By design,
+# we expect the validity errors to cause a fault before (nearly) any action
+# by the constructor.
+#
+# This is a special case where the accessors are not-used, in favor of the
+# S4 @tags. E.g. object at otu_table instead of otu_table(object). This is to avoid
+# any complications with the accessors interacting with objects early on.
+# Perhaps this is a mistake, but its a very limited case and won't be difficult
+# to change.
+#
+# Also, for now these are not documented at all at the user-level,
+# and are not expected to ever
+# be at the "user-level", so formal documentation probably unnecessary. Lots
+# of comments throughout this code will need to compensate.
+################################################################################
+########################################
+# otu_table:
+# # # * all values must be numeric (otu_table()-constructor should probably round values by default))
+# # # * all values must be >= 0 (no negative abundances)
+########################################
+validotu_table <- function(object){
+  # Validity check for otu_table: returns TRUE when valid, otherwise a
+  # character message describing the problem.
+  # Both dimensions must have non-zero length.
+  if( any(dim(object)==0) ){
+    return("\n OTU abundance data must have non-zero dimensions.")
+  }
+  # Verify that it is numeric matrix.
+  # The data part is read through the S4 `@` operator; checking the first
+  # column is enough because a matrix holds a single type.
+  if( !is.numeric(object@.Data[, 1]) ){
+    text = "\n Non-numeric matrix provided as OTU table.\n"
+    text = paste0(text, "Abundance is expected to be numeric.")
+    return(text)
+  }
+  return(TRUE)
+}
+## assign the function as the validity method for the otu_table class
+setValidity("otu_table", validotu_table)
+########################################
+########################################
+# sample_data:
+########################################
+validsample_data <- function(object){
+  # Validity check for sample_data: a table with a zero-length dimension is
+  # rejected with a message, anything else is accepted.
+  has_empty_dim <- any(dim(object)==0)
+  if( has_empty_dim ){
+    return("Sample Data must have non-zero dimensions.")
+  }
+  TRUE
+}
+## assign the function as the validity method for the sample_data class
+setValidity("sample_data", validsample_data)
+########################################
+########################################
+# taxonomyTable:
+########################################
+# # # * all values must be a character
+# # # * at least some non-NULL (or equiv) values
+# taxonomyTable validity function
+########################################
+validTaxonomyTable <- function(object){
+  # Validity check for taxonomyTable: returns TRUE when valid, otherwise a
+  # character message describing the problem.
+  # Both dimensions must have non-zero length.
+  if( any(dim(object)==0) ){
+    return("\n Taxonomy Table must have non-zero dimensions.")
+  }
+  # Verify that it is character matrix.
+  # The data part is read through the S4 `@` operator; checking the first
+  # column is enough because a matrix holds a single type.
+  if( !is.character(object@.Data[, 1]) ){
+    text = "\n Non-character matrix provided as Taxonomy Table.\n"
+    text = paste0(text, "Taxonomy is expected to be characters.")
+    return(text)
+  }
+  return(TRUE)
+}
+## assign the function as the validity method for the taxonomyTable class
+setValidity("taxonomyTable", validTaxonomyTable)
+########################################
+########################################
+# tree:
+########################################
+# # (Any rules about trees appropriate in this context?)
+
+########################################
+########################################
+# phyloseq-class:
+########################################
+# Because data-index complete-matching is checked/enforced by the phyloseq() constructor,
+# it should not be checked here, or the constructor will fail validity tests before
+# it gets the chance to groom the objects.
+# Instead, the validity test can check if there is any intersection of the species names
+# and/or sample names, prior to any attempt by the constructor to prune (which would end)
+# in a mysterious index error, anyway
+########################################
+validphyloseq <- function(object){
+  # Validity check for phyloseq: returns TRUE when valid, otherwise a
+  # character message describing the problem. Slots are read directly with
+  # the S4 `@` operator rather than through the accessors.
+  # There must be an otu_table
+  if( is.null(object@otu_table) ){
+    return("\n An otu_table is required for most analysis / graphics in the phyloseq-package")
+  }
+  # intersection of species-names must have non-zero length
+  if( length(intersect_taxa(object)) <= 0 ){
+    return(paste("\n Component taxa/OTU names do not match.\n",
+                 " Taxa indices are critical to analysis.\n Try taxa_names()", sep=""))
+  }
+  # If there is sample data, check that sample-names overlap
+  if( !is.null(object@sam_data) ){
+    if( length(intersect(sample_names(object@sam_data), sample_names(object@otu_table))) <= 0 ){
+      return("\n Component sample names do not match.\n Try sample_names()")
+    }
+  }
+  return(TRUE)
+}
+## assign the function as the validity method for the phyloseq class
+setValidity("phyloseq", validphyloseq)
+########################################
diff --git a/README.html b/README.html
new file mode 100644
index 0000000..7b7d2a5
--- /dev/null
+++ b/README.html
@@ -0,0 +1,102 @@
+<!DOCTYPE html>
+
+<html xmlns="http://www.w3.org/1999/xhtml">
+
+<head>
+
+<meta charset="utf-8">
+<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
+<meta http-equiv="Content-Style-Type" content="text/css" />
+<meta name="generator" content="pandoc" />
+
+
+
+<title></title>
+
+<script src="data:application/x-javascript,%2F%2A%21%20jQuery%20v1%2E11%2E0%20%7C%20%28c%29%202005%2C%202014%20jQuery%20Foundation%2C%20Inc%2E%20%7C%20jquery%2Eorg%2Flicense%20%2A%2F%0A%21function%28a%2Cb%29%7B%22object%22%3D%3Dtypeof%20module%26%26%22object%22%3D%3Dtypeof%20module%2Eexports%3Fmodule%2Eexports%3Da%2Edocument%3Fb%28a%2C%210%29%3Afunction%28a%29%7Bif%28%21a%2Edocument%29throw%20new%20Error%28%22jQuery%20requires%20a%20window%20with%20a%20document%22%29%3Breturn%20b%28a%29% [...]
+<meta name="viewport" content="width=device-width, initial-scale=1.0" />
+<link href="data:text/css,%2F%2A%21%0A%20%2A%20Bootstrap%20v2%2E3%2E2%0A%20%2A%0A%20%2A%20Copyright%202013%20Twitter%2C%20Inc%0A%20%2A%20Licensed%20under%20the%20Apache%20License%20v2%2E0%0A%20%2A%20http%3A%2F%2Fwww%2Eapache%2Eorg%2Flicenses%2FLICENSE%2D2%2E0%0A%20%2A%0A%20%2A%20Designed%20and%20built%20with%20all%20the%20love%20in%20the%20world%20by%20%40mdo%20and%20%40fat%2E%0A%20%2A%2F%2Eclearfix%7B%2Azoom%3A1%7D%2Eclearfix%3Abefore%2C%2Eclearfix%3Aafter%7Bdisplay%3Atable%3Bline%2Dhei [...]
+<link href="data:text/css,%2F%2A%21%0A%20%2A%20Bootstrap%20Responsive%20v2%2E3%2E2%0A%20%2A%0A%20%2A%20Copyright%202013%20Twitter%2C%20Inc%0A%20%2A%20Licensed%20under%20the%20Apache%20License%20v2%2E0%0A%20%2A%20http%3A%2F%2Fwww%2Eapache%2Eorg%2Flicenses%2FLICENSE%2D2%2E0%0A%20%2A%0A%20%2A%20Designed%20and%20built%20with%20all%20the%20love%20in%20the%20world%20by%20%40mdo%20and%20%40fat%2E%0A%20%2A%2F%2Eclearfix%7B%2Azoom%3A1%7D%2Eclearfix%3Abefore%2C%2Eclearfix%3Aafter%7Bdisplay%3Atable [...]
+<script src="data:application/x-javascript,%2F%2A%21%0A%2A%20Bootstrap%2Ejs%20by%20%40fat%20%26%20%40mdo%0A%2A%20Copyright%202013%20Twitter%2C%20Inc%2E%0A%2A%20http%3A%2F%2Fwww%2Eapache%2Eorg%2Flicenses%2FLICENSE%2D2%2E0%2Etxt%0A%2A%2F%0A%21function%28e%29%7B%22use%20strict%22%3Be%28function%28%29%7Be%2Esupport%2Etransition%3Dfunction%28%29%7Bvar%20e%3Dfunction%28%29%7Bvar%20e%3Ddocument%2EcreateElement%28%22bootstrap%22%29%2Ct%3D%7BWebkitTransition%3A%22webkitTransitionEnd%22%2CMozTrans [...]
+
+<style type="text/css">code{white-space: pre;}</style>
+<link href="data:text/css,pre%20%2Eoperator%2C%0Apre%20%2Eparen%20%7B%0A%20color%3A%20rgb%28104%2C%20118%2C%20135%29%0A%7D%0A%0Apre%20%2Eliteral%20%7B%0A%20color%3A%20%23990073%0A%7D%0A%0Apre%20%2Enumber%20%7B%0A%20color%3A%20%23099%3B%0A%7D%0A%0Apre%20%2Ecomment%20%7B%0A%20color%3A%20%23998%3B%0A%20font%2Dstyle%3A%20italic%0A%7D%0A%0Apre%20%2Ekeyword%20%7B%0A%20color%3A%20%23900%3B%0A%20font%2Dweight%3A%20bold%0A%7D%0A%0Apre%20%2Eidentifier%20%7B%0A%20color%3A%20rgb%280%2C%200%2C%200%29 [...]
+<script src="data:application/x-javascript,%0Avar%20hljs%3Dnew%20function%28%29%7Bfunction%20m%28p%29%7Breturn%20p%2Ereplace%28%2F%26%2Fgm%2C%22%26amp%3B%22%29%2Ereplace%28%2F%3C%2Fgm%2C%22%26lt%3B%22%29%7Dfunction%20f%28r%2Cq%2Cp%29%7Breturn%20RegExp%28q%2C%22m%22%2B%28r%2EcI%3F%22i%22%3A%22%22%29%2B%28p%3F%22g%22%3A%22%22%29%29%7Dfunction%20b%28r%29%7Bfor%28var%20p%3D0%3Bp%3Cr%2EchildNodes%2Elength%3Bp%2B%2B%29%7Bvar%20q%3Dr%2EchildNodes%5Bp%5D%3Bif%28q%2EnodeName%3D%3D%22CODE%22%29%7B [...]
+<style type="text/css">
+ pre:not([class]) {
+ background-color: white;
+ }
+</style>
+<script type="text/javascript">
+if (window.hljs && document.readyState && document.readyState === "complete") {
+ window.setTimeout(function() {
+ hljs.initHighlighting();
+ }, 0);
+}
+</script>
+
+
+
+</head>
+
+<body>
+
+<style type="text/css">
+.main-container {
+ max-width: 940px;
+ margin-left: auto;
+ margin-right: auto;
+}
+</style>
+<div class="container-fluid main-container">
+
+
+
+
+<p><link href="data:text/css,body%7B%0A%20%20%20%20margin%3A%200%20auto%3B%0A%20%20%20%20font%2Dfamily%3A%20Georgia%2C%20Palatino%2C%20serif%3B%0A%20%20%20%20color%3A%20%23444444%3B%0A%20%20%20%20line%2Dheight%3A%201%3B%0A%20%20%20%20max%2Dwidth%3A%20960px%3B%0A%20%20%20%20padding%3A%2030px%3B%0A%7D%0Ah1%2C%20h2%2C%20h3%2C%20h4%20%7B%0A%20%20%20%20color%3A%20%23111111%3B%0A%20%20%20%20font%2Dweight%3A%20400%3B%0A%7D%0Ah1%2C%20h2%2C%20h3%2C%20h4%2C%20h5%2C%20p%20%7B%0A%20%20%20%20margin%2 [...]
+<div id="phyloseq" class="section level1">
+<h1><a href="http://joey711.github.com/phyloseq/">phyloseq</a></h1>
+<div id="article-on-improved-microbiome-analysis" class="section level2">
+<h2>Article on Improved Microbiome Analysis</h2>
+<p>McMurdie and Holmes (2014) <a href="http://dx.plos.org/10.1371/journal.pcbi.1003531">Waste Not, Want Not: Why Rarefying Microbiome Data is Statistically Inadmissible</a> PLoS Computational Biology 10(4): e1003531</p>
+<p>Presubmission versions ahead of acceptance (2013): <a href="http://arxiv.org/pdf/1310.0424v2.pdf">PDF version 2</a>, <a href="http://arxiv.org/pdf/1310.0424v1.pdf">PDF version 1</a></p>
+</div>
+<div id="latest-peer-reviewed-article-about-phyloseq" class="section level2">
+<h2>Latest peer-reviewed article about phyloseq</h2>
+<p><a href="http://dx.plos.org/10.1371/journal.pone.0061217">phyloseq: An R package for reproducible interactive analysis and graphics of microbiome census data</a> (2013) PLoS ONE 8(4):e61217</p>
+</div>
+<div id="interface-with-microbio.meqiime" class="section level2">
+<h2>Interface with microbio.me/qiime</h2>
+<p>See the <a href="http://joey711.github.io/phyloseq/download-microbio.me.html">microbio_me_qiime tutorial</a> for more details and examples downloading and importing into phyloseq/R directly from this public database.</p>
+</div>
+<div id="other-resources" class="section level2">
+<h2>Other resources</h2>
+<p>The phyloseq project also has a number of supporting online resources, most of which can be found at <a href="http://joey711.github.com/phyloseq/">the phyloseq home page</a>, or from the phyloseq stable release <a href="http://bioconductor.org/packages/release/bioc/html/phyloseq.html">page on Bioconductor</a>.</p>
+<p>To post feature requests or ask for help, try <a href="https://github.com/joey711/phyloseq/issues">the phyloseq Issue Tracker</a>.</p>
+</div>
+</div>
+
+
+</div>
+
+<script>
+
+// add bootstrap table styles to pandoc tables
+$(document).ready(function () {
+ $('tr.header').parent('thead').parent('table').addClass('table table-condensed');
+});
+
+</script>
+
+<!-- dynamically load mathjax for compatibility with --self-contained -->
+<script>
+ (function () {
+ var script = document.createElement("script");
+ script.type = "text/javascript";
+ script.src = "https://c328740.ssl.cf1.rackcdn.com/mathjax/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML";
+ document.getElementsByTagName("head")[0].appendChild(script);
+ })();
+</script>
+
+</body>
+</html>
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..4eed595
--- /dev/null
+++ b/README.md
@@ -0,0 +1,54 @@
+<link href="http://joey711.github.com/phyloseq/markdown.css" rel="stylesheet"></link>
+
+# [phyloseq](http://joey711.github.com/phyloseq/)
+
+[![Travis-CI Build Status](https://travis-ci.org/joey711/phyloseq.svg?branch=master)](https://travis-ci.org/joey711/phyloseq)
+
+## Article on Improved Microbiome Analysis
+
+McMurdie and Holmes (2014)
+[Waste Not, Want Not: Why Rarefying Microbiome Data is Statistically Inadmissible](http://dx.plos.org/10.1371/journal.pcbi.1003531)
+*PLoS Computational Biology*
+10(4): e1003531
+
+Presubmission versions ahead of acceptance (2013):
+[PDF version 2](http://arxiv.org/pdf/1310.0424v2.pdf),
+[PDF version 1](http://arxiv.org/pdf/1310.0424v1.pdf)
+
+
+## Peer-reviewed articles about phyloseq
+
+McMurdie and Holmes (2014) [Shiny-phyloseq: Web Application for Interactive Microbiome Analysis with Provenance Tracking](http://bioinformatics.oxfordjournals.org/content/early/2014/10/02/bioinformatics.btu616).
+*Bioinformatics (Oxford, England)*
+31(2), 282–283.
+
+McMurdie and Holmes (2013)
+[phyloseq: An R package for reproducible interactive analysis and graphics of microbiome census data](http://dx.plos.org/10.1371/journal.pone.0061217)
+*PLoS ONE*
+8(4):e61217
+
+## Other resources
+
+The phyloseq project also has a number of supporting online resources,
+including (but probably not limited to)
+
+### [the phyloseq home page](http://joey711.github.com/phyloseq/)
+
+### [the phyloseq FAQ](https://www.bioconductor.org/packages/release/bioc/vignettes/phyloseq/inst/doc/phyloseq-FAQ.html)
+I recommend checking this page, and the issues tracker,
+before posting new issues.
+
+### [Bioconductor stable release](http://bioconductor.org/packages/release/bioc/html/phyloseq.html).
+
+### [the phyloseq Issue Tracker](https://github.com/joey711/phyloseq/issues)
+This is the recommended location to post
+
+(1) feature requests
+(2) bug reports
+(3) theoretical considerations
+(4) other issues, feedback
+(5) requests for help
+
+Search previous posts,
+and check [the phyloseq FAQ](https://www.bioconductor.org/packages/release/bioc/vignettes/phyloseq/inst/doc/phyloseq-FAQ.html)
+before posting a new issue.
diff --git a/TODO.txt b/TODO.txt
new file mode 100644
index 0000000..e702e2e
--- /dev/null
+++ b/TODO.txt
@@ -0,0 +1,5 @@
+Planned feature improvements are publicly catalogued at the main phyloseq development site on github; specifically on the "Issues" page for phyloseq:
+
+https://github.com/joey711/phyloseq/issues
+
+If the feature you are hoping for is not listed, you are welcome to add it as a feature request "issue" on this page. This request will be publicly available and listed on the page.
diff --git a/build/vignette.rds b/build/vignette.rds
new file mode 100644
index 0000000..2fa1f5b
Binary files /dev/null and b/build/vignette.rds differ
diff --git a/data/GlobalPatterns.RData b/data/GlobalPatterns.RData
new file mode 100644
index 0000000..a42938a
Binary files /dev/null and b/data/GlobalPatterns.RData differ
diff --git a/data/datalist b/data/datalist
new file mode 100644
index 0000000..1e982b4
--- /dev/null
+++ b/data/datalist
@@ -0,0 +1,4 @@
+GlobalPatterns
+enterotype
+esophagus
+soilrep
diff --git a/data/enterotype.RData b/data/enterotype.RData
new file mode 100644
index 0000000..1f7be0a
Binary files /dev/null and b/data/enterotype.RData differ
diff --git a/data/esophagus.RData b/data/esophagus.RData
new file mode 100644
index 0000000..9ae486a
Binary files /dev/null and b/data/esophagus.RData differ
diff --git a/data/soilrep.RData b/data/soilrep.RData
new file mode 100644
index 0000000..88522d0
Binary files /dev/null and b/data/soilrep.RData differ
diff --git a/debian/README.test b/debian/README.test
deleted file mode 100644
index 53fb4d7..0000000
--- a/debian/README.test
+++ /dev/null
@@ -1,8 +0,0 @@
-Notes on how this package can be tested.
-────────────────────────────────────────
-
-This package can be tested by running the provided test:
-
- sh ./run-unit-test
-
-in order to confirm its integrity.
diff --git a/debian/changelog b/debian/changelog
deleted file mode 100644
index 3798922..0000000
--- a/debian/changelog
+++ /dev/null
@@ -1,34 +0,0 @@
-r-bioc-phyloseq (1.19.1-2) unstable; urgency=medium
-
- * Fix versioned depends from r-cran-ade4
- Closes: #850767
-
- -- Andreas Tille <tille at debian.org> Tue, 10 Jan 2017 09:31:39 +0100
-
-r-bioc-phyloseq (1.19.1-1) unstable; urgency=medium
-
- * New upstream version
-
- -- Andreas Tille <tille at debian.org> Sun, 08 Jan 2017 08:28:07 +0100
-
-r-bioc-phyloseq (1.18.1-1) unstable; urgency=medium
-
- * New upstream version
- * debhelper 10
-
- -- Andreas Tille <tille at debian.org> Sun, 04 Dec 2016 20:59:27 +0100
-
-r-bioc-phyloseq (1.18.0-1) unstable; urgency=medium
-
- * New upstream version
- * Convert to dh-r
- * Generic BioConductor homepage
- * d/watch: version=4
-
- -- Andreas Tille <tille at debian.org> Thu, 17 Nov 2016 16:36:02 +0100
-
-r-bioc-phyloseq (1.16.2-1) unstable; urgency=low
-
- * Initial release (closes: #837600)
-
- -- Andreas Tille <tille at debian.org> Tue, 13 Sep 2016 07:39:43 +0200
diff --git a/debian/compat b/debian/compat
deleted file mode 100644
index f599e28..0000000
--- a/debian/compat
+++ /dev/null
@@ -1 +0,0 @@
-10
diff --git a/debian/control b/debian/control
deleted file mode 100644
index f9d2768..0000000
--- a/debian/control
+++ /dev/null
@@ -1,34 +0,0 @@
-Source: r-bioc-phyloseq
-Maintainer: Debian Med Packaging Team <debian-med-packaging at lists.alioth.debian.org>
-Uploaders: Andreas Tille <tille at debian.org>
-Section: gnu-r
-Priority: optional
-Build-Depends: debhelper (>= 10),
- dh-r,
- r-base-dev,
- r-bioc-biobase,
- r-bioc-biostrings,
- r-bioc-biomformat,
- r-bioc-multtest,
- r-cran-ade4,
- r-cran-ape,
- r-cran-data.table,
- r-cran-foreach,
- r-cran-ggplot2,
- r-cran-igraph,
- r-cran-vegan
-Standards-Version: 3.9.8
-Vcs-Browser: https://anonscm.debian.org/viewvc/debian-med/trunk/packages/R/r-bioc-phyloseq/trunk/
-Vcs-Svn: svn://anonscm.debian.org/debian-med/trunk/packages/R/r-bioc-phyloseq/trunk/
-Homepage: https://bioconductor.org/packages/phyloseq/
-
-Package: r-bioc-phyloseq
-Architecture: all
-Depends: ${R:Depends},
- ${misc:Depends}
-Recommends: ${R:Recommends}
-Suggests: ${R:Suggests}
-Description: GNU R handling and analysis of high-throughput microbiome census data
- The Bioconductor module phyloseq provides a set of classes and tools to
- facilitate the import, storage, analysis, and graphical display of
- microbiome census data.
diff --git a/debian/copyright b/debian/copyright
deleted file mode 100644
index b73fdcc..0000000
--- a/debian/copyright
+++ /dev/null
@@ -1,675 +0,0 @@
-Format: https://www.debian.org/doc/packaging-manuals/copyright-format/1.0/
-Upstream-Name: phyloseq
-Upstream-Contact: Paul J. McMurdie <mcmurdie at stanford.edu>
-Source: https://bioconductor.org/packages/phyloseq/
-
-Files: *
-Copyright: 2013-2016 Paul J. McMurdie <mcmurdie at stanford.edu>
-License: AGPL-3
-
-Files: debian/*
-Copyright: 2016 Andreas Tille <tille at debian.org>
-License: AGPL-3
-
-License: AGPL-3
- GNU AFFERO GENERAL PUBLIC LICENSE
- Version 3, 19 November 2007
- .
- Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/>
- Everyone is permitted to copy and distribute verbatim copies
- of this license document, but changing it is not allowed.
- .
- Preamble
- .
- The GNU Affero General Public License is a free, copyleft license for
- software and other kinds of works, specifically designed to ensure
- cooperation with the community in the case of network server software.
- .
- The licenses for most software and other practical works are designed
- to take away your freedom to share and change the works. By contrast,
- our General Public Licenses are intended to guarantee your freedom to
- share and change all versions of a program--to make sure it remains free
- software for all its users.
- .
- When we speak of free software, we are referring to freedom, not
- price. Our General Public Licenses are designed to make sure that you
- have the freedom to distribute copies of free software (and charge for
- them if you wish), that you receive source code or can get it if you
- want it, that you can change the software or use pieces of it in new
- free programs, and that you know you can do these things.
- .
- Developers that use our General Public Licenses protect your rights
- with two steps: (1) assert copyright on the software, and (2) offer
- you this License which gives you legal permission to copy, distribute
- and/or modify the software.
- .
- A secondary benefit of defending all users' freedom is that
- improvements made in alternate versions of the program, if they
- receive widespread use, become available for other developers to
- incorporate. Many developers of free software are heartened and
- encouraged by the resulting cooperation. However, in the case of
- software used on network servers, this result may fail to come about.
- The GNU General Public License permits making a modified version and
- letting the public access it on a server without ever releasing its
- source code to the public.
- .
- The GNU Affero General Public License is designed specifically to
- ensure that, in such cases, the modified source code becomes available
- to the community. It requires the operator of a network server to
- provide the source code of the modified version running there to the
- users of that server. Therefore, public use of a modified version, on
- a publicly accessible server, gives the public access to the source
- code of the modified version.
- .
- An older license, called the Affero General Public License and
- published by Affero, was designed to accomplish similar goals. This is
- a different license, not a version of the Affero GPL, but Affero has
- released a new version of the Affero GPL which permits relicensing under
- this license.
- .
- The precise terms and conditions for copying, distribution and
- modification follow.
- .
- TERMS AND CONDITIONS
- .
- 0. Definitions.
- .
- "This License" refers to version 3 of the GNU Affero General Public License.
- .
- "Copyright" also means copyright-like laws that apply to other kinds of
- works, such as semiconductor masks.
- .
- "The Program" refers to any copyrightable work licensed under this
- License. Each licensee is addressed as "you". "Licensees" and
- "recipients" may be individuals or organizations.
- .
- To "modify" a work means to copy from or adapt all or part of the work
- in a fashion requiring copyright permission, other than the making of an
- exact copy. The resulting work is called a "modified version" of the
- earlier work or a work "based on" the earlier work.
- .
- A "covered work" means either the unmodified Program or a work based
- on the Program.
- .
- To "propagate" a work means to do anything with it that, without
- permission, would make you directly or secondarily liable for
- infringement under applicable copyright law, except executing it on a
- computer or modifying a private copy. Propagation includes copying,
- distribution (with or without modification), making available to the
- public, and in some countries other activities as well.
- .
- To "convey" a work means any kind of propagation that enables other
- parties to make or receive copies. Mere interaction with a user through
- a computer network, with no transfer of a copy, is not conveying.
- .
- An interactive user interface displays "Appropriate Legal Notices"
- to the extent that it includes a convenient and prominently visible
- feature that (1) displays an appropriate copyright notice, and (2)
- tells the user that there is no warranty for the work (except to the
- extent that warranties are provided), that licensees may convey the
- work under this License, and how to view a copy of this License. If
- the interface presents a list of user commands or options, such as a
- menu, a prominent item in the list meets this criterion.
- .
- 1. Source Code.
- .
- The "source code" for a work means the preferred form of the work
- for making modifications to it. "Object code" means any non-source
- form of a work.
- .
- A "Standard Interface" means an interface that either is an official
- standard defined by a recognized standards body, or, in the case of
- interfaces specified for a particular programming language, one that
- is widely used among developers working in that language.
- .
- The "System Libraries" of an executable work include anything, other
- than the work as a whole, that (a) is included in the normal form of
- packaging a Major Component, but which is not part of that Major
- Component, and (b) serves only to enable use of the work with that
- Major Component, or to implement a Standard Interface for which an
- implementation is available to the public in source code form. A
- "Major Component", in this context, means a major essential component
- (kernel, window system, and so on) of the specific operating system
- (if any) on which the executable work runs, or a compiler used to
- produce the work, or an object code interpreter used to run it.
- .
- The "Corresponding Source" for a work in object code form means all
- the source code needed to generate, install, and (for an executable
- work) run the object code and to modify the work, including scripts to
- control those activities. However, it does not include the work's
- System Libraries, or general-purpose tools or generally available free
- programs which are used unmodified in performing those activities but
- which are not part of the work. For example, Corresponding Source
- includes interface definition files associated with source files for
- the work, and the source code for shared libraries and dynamically
- linked subprograms that the work is specifically designed to require,
- such as by intimate data communication or control flow between those
- subprograms and other parts of the work.
- .
- The Corresponding Source need not include anything that users
- can regenerate automatically from other parts of the Corresponding
- Source.
- .
- The Corresponding Source for a work in source code form is that
- same work.
- .
- 2. Basic Permissions.
- .
- All rights granted under this License are granted for the term of
- copyright on the Program, and are irrevocable provided the stated
- conditions are met. This License explicitly affirms your unlimited
- permission to run the unmodified Program. The output from running a
- covered work is covered by this License only if the output, given its
- content, constitutes a covered work. This License acknowledges your
- rights of fair use or other equivalent, as provided by copyright law.
- .
- You may make, run and propagate covered works that you do not
- convey, without conditions so long as your license otherwise remains
- in force. You may convey covered works to others for the sole purpose
- of having them make modifications exclusively for you, or provide you
- with facilities for running those works, provided that you comply with
- the terms of this License in conveying all material for which you do
- not control copyright. Those thus making or running the covered works
- for you must do so exclusively on your behalf, under your direction
- and control, on terms that prohibit them from making any copies of
- your copyrighted material outside their relationship with you.
- .
- Conveying under any other circumstances is permitted solely under
- the conditions stated below. Sublicensing is not allowed; section 10
- makes it unnecessary.
- .
- 3. Protecting Users' Legal Rights From Anti-Circumvention Law.
- .
- No covered work shall be deemed part of an effective technological
- measure under any applicable law fulfilling obligations under article
- 11 of the WIPO copyright treaty adopted on 20 December 1996, or
- similar laws prohibiting or restricting circumvention of such
- measures.
- .
- When you convey a covered work, you waive any legal power to forbid
- circumvention of technological measures to the extent such circumvention
- is effected by exercising rights under this License with respect to
- the covered work, and you disclaim any intention to limit operation or
- modification of the work as a means of enforcing, against the work's
- users, your or third parties' legal rights to forbid circumvention of
- technological measures.
- .
- 4. Conveying Verbatim Copies.
- .
- You may convey verbatim copies of the Program's source code as you
- receive it, in any medium, provided that you conspicuously and
- appropriately publish on each copy an appropriate copyright notice;
- keep intact all notices stating that this License and any
- non-permissive terms added in accord with section 7 apply to the code;
- keep intact all notices of the absence of any warranty; and give all
- recipients a copy of this License along with the Program.
- .
- You may charge any price or no price for each copy that you convey,
- and you may offer support or warranty protection for a fee.
- .
- 5. Conveying Modified Source Versions.
- .
- You may convey a work based on the Program, or the modifications to
- produce it from the Program, in the form of source code under the
- terms of section 4, provided that you also meet all of these conditions:
- .
- a) The work must carry prominent notices stating that you modified
- it, and giving a relevant date.
- .
- b) The work must carry prominent notices stating that it is
- released under this License and any conditions added under section
- 7. This requirement modifies the requirement in section 4 to
- "keep intact all notices".
- .
- c) You must license the entire work, as a whole, under this
- License to anyone who comes into possession of a copy. This
- License will therefore apply, along with any applicable section 7
- additional terms, to the whole of the work, and all its parts,
- regardless of how they are packaged. This License gives no
- permission to license the work in any other way, but it does not
- invalidate such permission if you have separately received it.
- .
- d) If the work has interactive user interfaces, each must display
- Appropriate Legal Notices; however, if the Program has interactive
- interfaces that do not display Appropriate Legal Notices, your
- work need not make them do so.
- .
- A compilation of a covered work with other separate and independent
- works, which are not by their nature extensions of the covered work,
- and which are not combined with it such as to form a larger program,
- in or on a volume of a storage or distribution medium, is called an
- "aggregate" if the compilation and its resulting copyright are not
- used to limit the access or legal rights of the compilation's users
- beyond what the individual works permit. Inclusion of a covered work
- in an aggregate does not cause this License to apply to the other
- parts of the aggregate.
- .
- 6. Conveying Non-Source Forms.
- .
- You may convey a covered work in object code form under the terms
- of sections 4 and 5, provided that you also convey the
- machine-readable Corresponding Source under the terms of this License,
- in one of these ways:
- .
- a) Convey the object code in, or embodied in, a physical product
- (including a physical distribution medium), accompanied by the
- Corresponding Source fixed on a durable physical medium
- customarily used for software interchange.
- .
- b) Convey the object code in, or embodied in, a physical product
- (including a physical distribution medium), accompanied by a
- written offer, valid for at least three years and valid for as
- long as you offer spare parts or customer support for that product
- model, to give anyone who possesses the object code either (1) a
- copy of the Corresponding Source for all the software in the
- product that is covered by this License, on a durable physical
- medium customarily used for software interchange, for a price no
- more than your reasonable cost of physically performing this
- conveying of source, or (2) access to copy the
- Corresponding Source from a network server at no charge.
- .
- c) Convey individual copies of the object code with a copy of the
- written offer to provide the Corresponding Source. This
- alternative is allowed only occasionally and noncommercially, and
- only if you received the object code with such an offer, in accord
- with subsection 6b.
- .
- d) Convey the object code by offering access from a designated
- place (gratis or for a charge), and offer equivalent access to the
- Corresponding Source in the same way through the same place at no
- further charge. You need not require recipients to copy the
- Corresponding Source along with the object code. If the place to
- copy the object code is a network server, the Corresponding Source
- may be on a different server (operated by you or a third party)
- that supports equivalent copying facilities, provided you maintain
- clear directions next to the object code saying where to find the
- Corresponding Source. Regardless of what server hosts the
- Corresponding Source, you remain obligated to ensure that it is
- available for as long as needed to satisfy these requirements.
- .
- e) Convey the object code using peer-to-peer transmission, provided
- you inform other peers where the object code and Corresponding
- Source of the work are being offered to the general public at no
- charge under subsection 6d.
- .
- A separable portion of the object code, whose source code is excluded
- from the Corresponding Source as a System Library, need not be
- included in conveying the object code work.
- .
- A "User Product" is either (1) a "consumer product", which means any
- tangible personal property which is normally used for personal, family,
- or household purposes, or (2) anything designed or sold for incorporation
- into a dwelling. In determining whether a product is a consumer product,
- doubtful cases shall be resolved in favor of coverage. For a particular
- product received by a particular user, "normally used" refers to a
- typical or common use of that class of product, regardless of the status
- of the particular user or of the way in which the particular user
- actually uses, or expects or is expected to use, the product. A product
- is a consumer product regardless of whether the product has substantial
- commercial, industrial or non-consumer uses, unless such uses represent
- the only significant mode of use of the product.
- .
- "Installation Information" for a User Product means any methods,
- procedures, authorization keys, or other information required to install
- and execute modified versions of a covered work in that User Product from
- a modified version of its Corresponding Source. The information must
- suffice to ensure that the continued functioning of the modified object
- code is in no case prevented or interfered with solely because
- modification has been made.
- .
- If you convey an object code work under this section in, or with, or
- specifically for use in, a User Product, and the conveying occurs as
- part of a transaction in which the right of possession and use of the
- User Product is transferred to the recipient in perpetuity or for a
- fixed term (regardless of how the transaction is characterized), the
- Corresponding Source conveyed under this section must be accompanied
- by the Installation Information. But this requirement does not apply
- if neither you nor any third party retains the ability to install
- modified object code on the User Product (for example, the work has
- been installed in ROM).
- .
- The requirement to provide Installation Information does not include a
- requirement to continue to provide support service, warranty, or updates
- for a work that has been modified or installed by the recipient, or for
- the User Product in which it has been modified or installed. Access to a
- network may be denied when the modification itself materially and
- adversely affects the operation of the network or violates the rules and
- protocols for communication across the network.
- .
- Corresponding Source conveyed, and Installation Information provided,
- in accord with this section must be in a format that is publicly
- documented (and with an implementation available to the public in
- source code form), and must require no special password or key for
- unpacking, reading or copying.
- .
- 7. Additional Terms.
- .
- "Additional permissions" are terms that supplement the terms of this
- License by making exceptions from one or more of its conditions.
- Additional permissions that are applicable to the entire Program shall
- be treated as though they were included in this License, to the extent
- that they are valid under applicable law. If additional permissions
- apply only to part of the Program, that part may be used separately
- under those permissions, but the entire Program remains governed by
- this License without regard to the additional permissions.
- .
- When you convey a copy of a covered work, you may at your option
- remove any additional permissions from that copy, or from any part of
- it. (Additional permissions may be written to require their own
- removal in certain cases when you modify the work.) You may place
- additional permissions on material, added by you to a covered work,
- for which you have or can give appropriate copyright permission.
- .
- Notwithstanding any other provision of this License, for material you
- add to a covered work, you may (if authorized by the copyright holders of
- that material) supplement the terms of this License with terms:
- .
- a) Disclaiming warranty or limiting liability differently from the
- terms of sections 15 and 16 of this License; or
- .
- b) Requiring preservation of specified reasonable legal notices or
- author attributions in that material or in the Appropriate Legal
- Notices displayed by works containing it; or
- .
- c) Prohibiting misrepresentation of the origin of that material, or
- requiring that modified versions of such material be marked in
- reasonable ways as different from the original version; or
- .
- d) Limiting the use for publicity purposes of names of licensors or
- authors of the material; or
- .
- e) Declining to grant rights under trademark law for use of some
- trade names, trademarks, or service marks; or
- .
- f) Requiring indemnification of licensors and authors of that
- material by anyone who conveys the material (or modified versions of
- it) with contractual assumptions of liability to the recipient, for
- any liability that these contractual assumptions directly impose on
- those licensors and authors.
- .
- All other non-permissive additional terms are considered "further
- restrictions" within the meaning of section 10. If the Program as you
- received it, or any part of it, contains a notice stating that it is
- governed by this License along with a term that is a further
- restriction, you may remove that term. If a license document contains
- a further restriction but permits relicensing or conveying under this
- License, you may add to a covered work material governed by the terms
- of that license document, provided that the further restriction does
- not survive such relicensing or conveying.
- .
- If you add terms to a covered work in accord with this section, you
- must place, in the relevant source files, a statement of the
- additional terms that apply to those files, or a notice indicating
- where to find the applicable terms.
- .
- Additional terms, permissive or non-permissive, may be stated in the
- form of a separately written license, or stated as exceptions;
- the above requirements apply either way.
- .
- 8. Termination.
- .
- You may not propagate or modify a covered work except as expressly
- provided under this License. Any attempt otherwise to propagate or
- modify it is void, and will automatically terminate your rights under
- this License (including any patent licenses granted under the third
- paragraph of section 11).
- .
- However, if you cease all violation of this License, then your
- license from a particular copyright holder is reinstated (a)
- provisionally, unless and until the copyright holder explicitly and
- finally terminates your license, and (b) permanently, if the copyright
- holder fails to notify you of the violation by some reasonable means
- prior to 60 days after the cessation.
- .
- Moreover, your license from a particular copyright holder is
- reinstated permanently if the copyright holder notifies you of the
- violation by some reasonable means, this is the first time you have
- received notice of violation of this License (for any work) from that
- copyright holder, and you cure the violation prior to 30 days after
- your receipt of the notice.
- .
- Termination of your rights under this section does not terminate the
- licenses of parties who have received copies or rights from you under
- this License. If your rights have been terminated and not permanently
- reinstated, you do not qualify to receive new licenses for the same
- material under section 10.
- .
- 9. Acceptance Not Required for Having Copies.
- .
- You are not required to accept this License in order to receive or
- run a copy of the Program. Ancillary propagation of a covered work
- occurring solely as a consequence of using peer-to-peer transmission
- to receive a copy likewise does not require acceptance. However,
- nothing other than this License grants you permission to propagate or
- modify any covered work. These actions infringe copyright if you do
- not accept this License. Therefore, by modifying or propagating a
- covered work, you indicate your acceptance of this License to do so.
- .
- 10. Automatic Licensing of Downstream Recipients.
- .
- Each time you convey a covered work, the recipient automatically
- receives a license from the original licensors, to run, modify and
- propagate that work, subject to this License. You are not responsible
- for enforcing compliance by third parties with this License.
- .
- An "entity transaction" is a transaction transferring control of an
- organization, or substantially all assets of one, or subdividing an
- organization, or merging organizations. If propagation of a covered
- work results from an entity transaction, each party to that
- transaction who receives a copy of the work also receives whatever
- licenses to the work the party's predecessor in interest had or could
- give under the previous paragraph, plus a right to possession of the
- Corresponding Source of the work from the predecessor in interest, if
- the predecessor has it or can get it with reasonable efforts.
- .
- You may not impose any further restrictions on the exercise of the
- rights granted or affirmed under this License. For example, you may
- not impose a license fee, royalty, or other charge for exercise of
- rights granted under this License, and you may not initiate litigation
- (including a cross-claim or counterclaim in a lawsuit) alleging that
- any patent claim is infringed by making, using, selling, offering for
- sale, or importing the Program or any portion of it.
- .
- 11. Patents.
- .
- A "contributor" is a copyright holder who authorizes use under this
- License of the Program or a work on which the Program is based. The
- work thus licensed is called the contributor's "contributor version".
- .
- A contributor's "essential patent claims" are all patent claims
- owned or controlled by the contributor, whether already acquired or
- hereafter acquired, that would be infringed by some manner, permitted
- by this License, of making, using, or selling its contributor version,
- but do not include claims that would be infringed only as a
- consequence of further modification of the contributor version. For
- purposes of this definition, "control" includes the right to grant
- patent sublicenses in a manner consistent with the requirements of
- this License.
- .
- Each contributor grants you a non-exclusive, worldwide, royalty-free
- patent license under the contributor's essential patent claims, to
- make, use, sell, offer for sale, import and otherwise run, modify and
- propagate the contents of its contributor version.
- .
- In the following three paragraphs, a "patent license" is any express
- agreement or commitment, however denominated, not to enforce a patent
- (such as an express permission to practice a patent or covenant not to
- sue for patent infringement). To "grant" such a patent license to a
- party means to make such an agreement or commitment not to enforce a
- patent against the party.
- .
- If you convey a covered work, knowingly relying on a patent license,
- and the Corresponding Source of the work is not available for anyone
- to copy, free of charge and under the terms of this License, through a
- publicly available network server or other readily accessible means,
- then you must either (1) cause the Corresponding Source to be so
- available, or (2) arrange to deprive yourself of the benefit of the
- patent license for this particular work, or (3) arrange, in a manner
- consistent with the requirements of this License, to extend the patent
- license to downstream recipients. "Knowingly relying" means you have
- actual knowledge that, but for the patent license, your conveying the
- covered work in a country, or your recipient's use of the covered work
- in a country, would infringe one or more identifiable patents in that
- country that you have reason to believe are valid.
- .
- If, pursuant to or in connection with a single transaction or
- arrangement, you convey, or propagate by procuring conveyance of, a
- covered work, and grant a patent license to some of the parties
- receiving the covered work authorizing them to use, propagate, modify
- or convey a specific copy of the covered work, then the patent license
- you grant is automatically extended to all recipients of the covered
- work and works based on it.
- .
- A patent license is "discriminatory" if it does not include within
- the scope of its coverage, prohibits the exercise of, or is
- conditioned on the non-exercise of one or more of the rights that are
- specifically granted under this License. You may not convey a covered
- work if you are a party to an arrangement with a third party that is
- in the business of distributing software, under which you make payment
- to the third party based on the extent of your activity of conveying
- the work, and under which the third party grants, to any of the
- parties who would receive the covered work from you, a discriminatory
- patent license (a) in connection with copies of the covered work
- conveyed by you (or copies made from those copies), or (b) primarily
- for and in connection with specific products or compilations that
- contain the covered work, unless you entered into that arrangement,
- or that patent license was granted, prior to 28 March 2007.
- .
- Nothing in this License shall be construed as excluding or limiting
- any implied license or other defenses to infringement that may
- otherwise be available to you under applicable patent law.
- .
- 12. No Surrender of Others' Freedom.
- .
- If conditions are imposed on you (whether by court order, agreement or
- otherwise) that contradict the conditions of this License, they do not
- excuse you from the conditions of this License. If you cannot convey a
- covered work so as to satisfy simultaneously your obligations under this
- License and any other pertinent obligations, then as a consequence you may
- not convey it at all. For example, if you agree to terms that obligate you
- to collect a royalty for further conveying from those to whom you convey
- the Program, the only way you could satisfy both those terms and this
- License would be to refrain entirely from conveying the Program.
- .
- 13. Remote Network Interaction; Use with the GNU General Public License.
- .
- Notwithstanding any other provision of this License, if you modify the
- Program, your modified version must prominently offer all users
- interacting with it remotely through a computer network (if your version
- supports such interaction) an opportunity to receive the Corresponding
- Source of your version by providing access to the Corresponding Source
- from a network server at no charge, through some standard or customary
- means of facilitating copying of software. This Corresponding Source
- shall include the Corresponding Source for any work covered by version 3
- of the GNU General Public License that is incorporated pursuant to the
- following paragraph.
- .
- Notwithstanding any other provision of this License, you have
- permission to link or combine any covered work with a work licensed
- under version 3 of the GNU General Public License into a single
- combined work, and to convey the resulting work. The terms of this
- License will continue to apply to the part which is the covered work,
- but the work with which it is combined will remain governed by version
- 3 of the GNU General Public License.
- .
- 14. Revised Versions of this License.
- .
- The Free Software Foundation may publish revised and/or new versions of
- the GNU Affero General Public License from time to time. Such new versions
- will be similar in spirit to the present version, but may differ in detail to
- address new problems or concerns.
- .
- Each version is given a distinguishing version number. If the
- Program specifies that a certain numbered version of the GNU Affero General
- Public License "or any later version" applies to it, you have the
- option of following the terms and conditions either of that numbered
- version or of any later version published by the Free Software
- Foundation. If the Program does not specify a version number of the
- GNU Affero General Public License, you may choose any version ever published
- by the Free Software Foundation.
- .
- If the Program specifies that a proxy can decide which future
- versions of the GNU Affero General Public License can be used, that proxy's
- public statement of acceptance of a version permanently authorizes you
- to choose that version for the Program.
- .
- Later license versions may give you additional or different
- permissions. However, no additional obligations are imposed on any
- author or copyright holder as a result of your choosing to follow a
- later version.
- .
- 15. Disclaimer of Warranty.
- .
- THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
- APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
- HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
- OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
- THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
- PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
- IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
- ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
- .
- 16. Limitation of Liability.
- .
- IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
- WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
- THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
- GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
- USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
- DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
- PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
- EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
- SUCH DAMAGES.
- .
- 17. Interpretation of Sections 15 and 16.
- .
- If the disclaimer of warranty and limitation of liability provided
- above cannot be given local legal effect according to their terms,
- reviewing courts shall apply local law that most closely approximates
- an absolute waiver of all civil liability in connection with the
- Program, unless a warranty or assumption of liability accompanies a
- copy of the Program in return for a fee.
- .
- END OF TERMS AND CONDITIONS
- .
- How to Apply These Terms to Your New Programs
- .
- If you develop a new program, and you want it to be of the greatest
- possible use to the public, the best way to achieve this is to make it
- free software which everyone can redistribute and change under these terms.
- .
- To do so, attach the following notices to the program. It is safest
- to attach them to the start of each source file to most effectively
- state the exclusion of warranty; and each file should have at least
- the "copyright" line and a pointer to where the full notice is found.
- .
- <one line to give the program's name and a brief idea of what it does.>
- Copyright (C) <year> <name of author>
- .
- This program is free software: you can redistribute it and/or modify
- it under the terms of the GNU Affero General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
- .
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU Affero General Public License for more details.
- .
- You should have received a copy of the GNU Affero General Public License
- along with this program. If not, see <http://www.gnu.org/licenses/>.
- .
- Also add information on how to contact you by electronic and paper mail.
- .
- If your software can interact with users remotely through a computer
- network, you should also make sure that it provides a way for users to
- get its source. For example, if your program is a web application, its
- interface could display a "Source" link that leads users to an archive
- of the code. There are many ways you could offer source, and different
- solutions will be better for different programs; see section 13 for the
- specific requirements.
- .
- You should also get your employer (if you work as a programmer) or school,
- if any, to sign a "copyright disclaimer" for the program, if necessary.
- For more information on this, and how to apply and follow the GNU AGPL, see
- <http://www.gnu.org/licenses/>.
diff --git a/debian/docs b/debian/docs
deleted file mode 100644
index 960011c..0000000
--- a/debian/docs
+++ /dev/null
@@ -1,3 +0,0 @@
-tests
-debian/README.test
-debian/tests/run-unit-test
diff --git a/debian/patches/fix_r-cran-ade4_versioning.patch b/debian/patches/fix_r-cran-ade4_versioning.patch
deleted file mode 100644
index 7149493..0000000
--- a/debian/patches/fix_r-cran-ade4_versioning.patch
+++ /dev/null
@@ -1,21 +0,0 @@
-Author: Andreas Tille <tille at debian.org>
-Last-Update: Tue, 10 Jan 2017 08:53:44 +0100
-Description: Phyloseq upstream does not distinguish between '.' and '-' in
- version specification. Due to automatic calculation of dependencies via
- ${R:Depends} the wrongly versioned relation to r-cran-ade4 ends up in the
- control information of the package. This patch fixes the relation.
- .
- See
- https://lists.alioth.debian.org/pipermail/debian-med-packaging/2016-November/047737.html
-
---- a/DESCRIPTION
-+++ b/DESCRIPTION
-@@ -10,7 +10,7 @@ Author: Paul J. McMurdie <joey711 at gmail.
- Susan Holmes <susan at stat.stanford.edu>, with
- contributions from Gregory Jordan and Scott Chamberlain
- License: AGPL-3
--Imports: BiocGenerics (>= 0.18.0), ade4 (>= 1.7.4), ape (>= 3.4),
-+Imports: BiocGenerics (>= 0.18.0), ade4 (>= 1.7-4), ape (>= 3.4),
- biomformat (>= 1.0.0), Biostrings (>= 2.40.0), cluster (>=
- 2.0.4), data.table (>= 1.9.6), foreach (>= 1.4.3), ggplot2 (>=
- 2.1.0), igraph (>= 1.0.1), methods (>= 3.3.0), multtest (>=
diff --git a/debian/patches/series b/debian/patches/series
deleted file mode 100644
index 3ab221a..0000000
--- a/debian/patches/series
+++ /dev/null
@@ -1 +0,0 @@
-fix_r-cran-ade4_versioning.patch
diff --git a/debian/rules b/debian/rules
deleted file mode 100755
index 68d9a36..0000000
--- a/debian/rules
+++ /dev/null
@@ -1,4 +0,0 @@
-#!/usr/bin/make -f
-
-%:
- dh $@ --buildsystem R
diff --git a/debian/source/format b/debian/source/format
deleted file mode 100644
index 163aaf8..0000000
--- a/debian/source/format
+++ /dev/null
@@ -1 +0,0 @@
-3.0 (quilt)
diff --git a/debian/tests/control b/debian/tests/control
deleted file mode 100644
index b044b0c..0000000
--- a/debian/tests/control
+++ /dev/null
@@ -1,3 +0,0 @@
-Tests: run-unit-test
-Depends: @, r-cran-testthat
-Restrictions: allow-stderr
diff --git a/debian/tests/run-unit-test b/debian/tests/run-unit-test
deleted file mode 100644
index b7ac9a9..0000000
--- a/debian/tests/run-unit-test
+++ /dev/null
@@ -1,13 +0,0 @@
-#!/bin/sh -e
-
-oname=phyloseq
-pkg=r-bioc-`echo $oname | tr '[A-Z]' '[a-z]'`
-
-if [ "$ADTTMP" = "" ] ; then
- ADTTMP=`mktemp -d /tmp/${pkg}-test.XXXXXX`
- trap "rm -rf $ADTTMP" 0 INT QUIT ABRT PIPE TERM
-fi
-cd $ADTTMP
-cp -a /usr/share/doc/${pkg}/tests/* $ADTTMP
-LC_ALL=C R --no-save < testthat-phyloseq.R
-rm -fr $ADTTMP/*
diff --git a/debian/watch b/debian/watch
deleted file mode 100644
index 0685e13..0000000
--- a/debian/watch
+++ /dev/null
@@ -1,3 +0,0 @@
-version=4
-opts=downloadurlmangle=s?^(.*)\.\.?http:$1packages/release/bioc? \
- http://www.bioconductor.org/packages/release/bioc/html/phyloseq.html .*/phyloseq_([\d\.]+)\.tar\.gz
diff --git a/inst/CITATION b/inst/CITATION
new file mode 100644
index 0000000..2279501
--- /dev/null
+++ b/inst/CITATION
@@ -0,0 +1,13 @@
+citHeader("To cite phyloseq in publications, or otherwise credit, please use:")
+
+citEntry(entry = "article",
+ author = "Paul J. McMurdie and Susan Holmes",
+ journal = "PLoS ONE",
+ Pages = "e61217",
+ title = "phyloseq: An R package for reproducible interactive analysis and graphics of microbiome census data",
+ Volume = "8",
+ Number = "4",
+ year = "2013",
+ Url = "http://dx.plos.org/10.1371/journal.pone.0061217",
+ textVersion = "phyloseq: An R package for reproducible interactive analysis and graphics of microbiome census data. Paul J. McMurdie and Susan Holmes (2013) PLoS ONE 8(4):e61217."
+)
diff --git a/inst/NEWS b/inst/NEWS
new file mode 100644
index 0000000..689c6ae
--- /dev/null
+++ b/inst/NEWS
@@ -0,0 +1,1659 @@
+CHANGES IN VERSION 1.13.6
+-------------------------
+
+BUG FIXES
+
+ - droplevels suggestion for sample-data https://github.com/joey711/phyloseq/pull/476
+
+ - DESeq2 migrated to suggests https://github.com/joey711/phyloseq/pull/533
+
+ - `extend_metagenomeSeq` functionality https://github.com/joey711/phyloseq/pull/533
+
+ - bugs related to previous version distance uptick, mostly in tests and vignette
+
+CHANGES IN VERSION 1.13.5
+-------------------------
+
+USER-VISIBLE CHANGES
+
+ - Help avoid cryptic errors due to name collision of `distance` with external loaded packages by making `distance` a formal S4 method in phyloseq.
+ - Improve documentation of `distance` function and the downstream procedures on which it depends
+ - Migrate the list of supported methods to a documented, exported list object, called `distanceMethodList`.
+ - Improved distance unit tests with detailed checks that dispatch works and gives exactly expected distance matrices for all methods defined in distanceMethodList.
+ - Improved JSD doc, performance, code, deprecated unnecessary `parallel` argument in JSD
+
+
+CHANGES IN VERSION 1.13.4
+-------------------------
+
+BUG FIXES
+
+ - `psmelt` bug if user has also loaded the original "reshape" package, due to name collision on the function called `melt`. `psmelt` now explicitly calls `reshape2::melt` to avoid confusion. https://github.com/joey711/phyloseq/pull/489
+
+ - Fix following note... There are ::: calls to the package's namespace in its code. A package almost never needs to use ::: for its own objects: ‘JSD.pair’
+
+CHANGES IN VERSION 1.11.3
+-------------------------
+
+BUG FIXES
+
+ - plot_heatmap bug when PCoA/MDS used as ordination method for axis ordering. This solves issue 420 https://github.com/joey711/phyloseq/issues/420
+
+CHANGES IN VERSION 1.11.2
+-------------------------
+
+USER-VISIBLE CHANGES
+
+ - `plot_heatmap` replaces `geom_tile` with `geom_raster`. This should afford a substantial speed improvement during rendering of heatmaps, according to ggplot2 documentation. This also solves Issue 401 https://github.com/joey711/phyloseq/issues/401
+
+CHANGES IN VERSION 1.11.1
+-------------------------
+
+BUG FIXES
+
+ - plot_heatmap y-axis
+
+CHANGES IN VERSION 1.9.15
+-------------------------
+
+BUG FIXES
+
+ - `phyloseq_to_deseq2` was adding an unnecessary pseudocount of `1` to the count matrix. No longer.
+
+ - Originally described at https://github.com/joey711/phyloseq/issues/387
+
+CHANGES IN VERSION 1.9.14
+-------------------------
+
+BUG FIXES
+
+ - `distance` erroneously transformed Rao distance results for method DPCoA. Now Fixed.
+
+ - Originally described at https://github.com/joey711/phyloseq/issues/390
+
+CHANGES IN VERSION 1.9.13
+-------------------------
+
+USER-VISIBLE CHANGES
+
+ - `distance` function now supports "wunifrac" option, for `UniFrac(..., weighted=TRUE)`
+
+ - `distance` function regexpr-matching for range of variants for weighted-UniFrac, unweighted-UniFrac method option
+
+ - `distance` function regexpr-matching for `type` argument range of alternatives
+
+ - Proposed in https://github.com/joey711/phyloseq/pull/384
+
+CHANGES IN VERSION 1.9.12
+-------------------------
+
+BUG FIXES
+
+ - `psmelt` function now properly handles single-OTU data
+
+ - Related to https://github.com/joey711/phyloseq/issues/338
+
+ - Builds on https://github.com/joey711/phyloseq/pull/373
+
+CHANGES IN VERSION 1.9.11
+-------------------------
+
+USER-VISIBLE CHANGES
+
+ - More robust `plot_ordination` behavior with clearer warning/error messages.
+
+ - Coordinates automatically checked/assigned to OTU or samples
+
+ - Attempt to calculate OTU or sample weighted-average coordinates via `vegan::wascores`, if-needed
+
+ - Species weighted-average via `vegan::wascores` supported now in phyloseq::scores.pcoa
+
+ - Related to https://github.com/joey711/phyloseq/pull/364
+
+CHANGES IN VERSION 1.9.10
+-------------------------
+
+USER-VISIBLE CHANGES
+
+ - Massive speed/memory improvement for UniFrac calculations (via `UniFrac` or `distance`)
+
+ - Added unit-tests for the correctness of UniFrac results (no bugs detected. results from pycogent)
+
+ - Moved all unit tests to tests/testthat as recommended by CRAN maintainers and testthat doc
+
+CHANGES IN VERSION 1.9.9
+-------------------------
+
+USER-VISIBLE CHANGES
+
+ - `plot_net` faster, more-flexible network plot with improved defaults
+
+ - https://github.com/joey711/phyloseq/pull/353
+
+CHANGES IN VERSION 1.9.8
+-------------------------
+
+USER-VISIBLE CHANGES
+
+ - `mt` includes other corrections, FDR by default.
+
+ - Resolves [Issue 59](https://github.com/joey711/phyloseq/issues/59)
+
+CHANGES IN VERSION 1.9.7
+-------------------------
+
+BUG FIXES
+
+ - Now requires ggplot2 version 1.0.0
+
+ - Fixes bug in which ggplot 1.0 breaks in a phyloseq vignette
+
+ - Resolves [Issue 347](https://github.com/joey711/phyloseq/issues/347)
+
+CHANGES IN VERSION 1.9.6
+-------------------------
+
+USER-VISIBLE CHANGES
+
+ - New `sortby` argument in `plot_richness` function.
+
+ - Sort discrete x by one or more alpha-diversity measures
+
+ - Solves [Issue 342](https://github.com/joey711/phyloseq/issues/342)
+
+ - Resolves/merges [Pull 343](https://github.com/joey711/phyloseq/pull/343)
+
+CHANGES IN VERSION 1.9.5
+-------------------------
+
+USER-VISIBLE CHANGES
+
+ - `microbio_me_qiime` function now handles string study number for first argument.
+
+ This is in addition to numeric study number, already supported.
+
+CHANGES IN VERSION 1.9.4
+-------------------------
+
+BUG FIXES
+
+ - `rarefy_even_depth()` function no longer enforces an orientation.
+
+ - It used to always coerce to OTU-by-sample orientation.
+
+ - Solves Issue 320 https://github.com/joey711/phyloseq/issues/320
+
+CHANGES IN VERSION 1.9.3
+-------------------------
+
+USER-VISIBLE CHANGES
+
+ - Massive Revision to plot_tree()
+
+ - `plot_tree` now uses the `psmelt` function
+
+ - All covariates are available for aesthetic mapping.
+
+ - `plot_tree` substantial speed improvement
+
+ - Uses native ape-package C code for tree computation
+
+ - Efficient `data.table` consolidated graphic data passed to ggplot2
+
+ - Additional arguments: `treetheme` and `justify`
+
+ - `tree_layout` - new, user-accessible function
+
+ - for building alternative trees from phyloseq data.
+
+ - Foundation for solving Issue 313 and Issue 331
+
+ - https://github.com/joey711/phyloseq/issues/313
+
+ - https://github.com/joey711/phyloseq/issues/331
+
+CHANGES IN VERSION 1.9.2
+-------------------------
+
+BUG FIXES
+
+ - Large files cause import_usearch_uc() to have error.
+ Error in paste0(readLines(ucfile), collapse = "\n") : result would exceed 2^31-1 bytes
+
+ - Solves Issue 327: https://github.com/joey711/phyloseq/issues/327
+
+CHANGES IN VERSION 1.9.1
+-------------------------
+
+BUG FIXES
+
+ - Bug in `psmelt` causing unnecessary error for phyloseq datasets with empty components.
+
+ - Solves Issue 319: https://github.com/joey711/phyloseq/issues/319
+
+CHANGES IN VERSION 1.7.24
+-------------------------
+
+USER-VISIBLE CHANGES
+
+ - Added support for [Partial] Constrained Analysis of Principal Coordinates (CAP).
+
+ - A supported/documented option in `ordinate`, supported by `plot_ordination`.
+
+ - This solves [Issue 312](https://github.com/joey711/phyloseq/issues/312).
+
+ - The `ordinate` function now takes an explicit `formula` argument.
+
+ - This facilitates reliable constrained ordination calls for:
+
+ - CAP (this commit)
+
+ - RDA (partial redundancy analysis)
+
+ - CCA (constrained correspondence analysis)
+
+CHANGES IN VERSION 1.7.23
+-------------------------
+
+USER-VISIBLE CHANGES
+
+ - Refactor `plot_ordination` to be more stable, error less and give informative warnings.
+
+ - Expect no critical API changes. Some errors now informative warnings with useful auto-changes to parameters.
+
+ - The `type='biplot'` option no longer hard-specifies a discrete color scale. Available default palette should work.
+
+ - For `type='biplot'`, the non-variable (Taxa or Sample) label will always appear first in a discrete legend.
+
+CHANGES IN VERSION 1.7.22
+-------------------------
+
+USER-VISIBLE CHANGES
+
+ - Revised psmelt to automatically modify data column-name conflicts, with warning
+
+ - Updated `psmelt` doc to formally notify users of these potential conflicts.
+
+ - This solves Issue 307: https://github.com/joey711/phyloseq/issues/307
+
+CHANGES IN VERSION 1.7.21
+-------------------------
+
+USER-VISIBLE CHANGES
+
+ - Updated `import_qiime` doc to emphasize it is intended for legacy QIIME files.
+
+ - Much faster and mem-efficient import of legacy QIIME and usearch files.
+
+ - Uses data.table syntax to better manage import of large files.
+
+ - Entire HMPv35 now imports in about 1 minute, low risk of mem-swap.
+
+ - Added dependency to data.table
+
+CHANGES IN VERSION 1.7.20
+-------------------------
+
+USER-VISIBLE CHANGES
+
+ - No user-visible changes. All future compatibility changes.
+
+BUG FIXES
+
+ - Unit test changes to work with upcoming R release and new testthat version.
+
+CHANGES IN VERSION 1.7.19
+-------------------------
+
+USER-VISIBLE CHANGES
+
+ - Documentation revisions. Faster examples, updated links.
+
+CHANGES IN VERSION 1.7.18
+-------------------------
+
+USER-VISIBLE CHANGES
+
+ - Fix minor bug that prohibited parallel execution of weighted UniFrac
+
+CHANGES IN VERSION 1.7.17
+-------------------------
+
+USER-VISIBLE CHANGES
+
+ - Added `import_usearch_uc`
+
+ Added first-time support for usearch “.uc” style output table.
+ Addresses Issue 286, importing from UPARSE.
+ https://github.com/joey711/phyloseq/issues/286
+ Further feedback on performance, use-cases, should be posted there.
+
+CHANGES IN VERSION 1.7.16
+-------------------------
+
+USER-VISIBLE CHANGES
+
+ - Fixed minor bug affecting legend-order in `plot_network`
+ Issue 288,
+ https://github.com/joey711/phyloseq/issues/288
+
+CHANGES IN VERSION 1.7.15
+-------------------------
+
+USER-VISIBLE CHANGES
+
+ - `tip_glom` now uses standard R clustering tools, and takes their arguments
+ documentation and tests updated to reflect the change
+ much simpler, faster
+
+ - `merge_taxa` now uses abundance to determine the archetype by default. Previously arbitrary.
+
+CHANGES IN VERSION 1.7.14
+-------------------------
+
+USER-VISIBLE CHANGES
+
+ - Minor change in mixture model vignette, revised graphic
+
+CHANGES IN VERSION 1.7.13
+-------------------------
+
+USER-VISIBLE CHANGES
+
+ - Deprecated originalUniFrac() internal function
+ old (original) unifrac algorithm no longer supported.
+ Addresses Issue 66:
+ https://github.com/joey711/phyloseq/issues/66
+
+CHANGES IN VERSION 1.7.12
+-------------------------
+
+USER-VISIBLE CHANGES
+
+ - Formal deprecation of functions using .Deprecated
+ Issue 269,
+ https://github.com/joey711/phyloseq/issues/269
+
+ - Fixed bug in interface with vegan::fisher.alpha(..., se=TRUE).
+ vegan doc states that this returns a data.frame,
+ but a data.frame is not returned in vegan version 1.7.10.
+ phyloseq now checks output dimensions before processing in `estimate_richness`
+
+ - Replaced deprecated functions in tests and documentation.
+
+CHANGES IN VERSION 1.7.11
+-------------------------
+
+USER-VISIBLE CHANGES
+
+ - Adds warning in make_network() and error in plot_network if empty graph encountered
+ Issue 275, check/warning for empty igraph objects
+ https://github.com/joey711/phyloseq/issues/275
+
+ - rarefy_even_depth() messages changed from cat() to message(), and optional verbose argument added
+ https://github.com/joey711/phyloseq/issues/263
+
+CHANGES IN VERSION 1.7.10
+-------------------------
+
+USER-VISIBLE CHANGES
+
+ - Fixes build-error originating from change in ade4 NAMESPACE in version 1.6.2
+
+ - Change minimum ade4 version to 1.6.2
+
+ - Uncommented examples now included in documentation for DPCoA function
+
+CHANGES IN VERSION 1.7.9
+-------------------------
+
+USER-VISIBLE CHANGES
+
+ - Fixed typo-derived bug in new vignette.
+
+ - These changes allow user to build from source without error.
+
+CHANGES IN VERSION 1.7.8
+-------------------------
+
+USER-VISIBLE CHANGES
+
+ - Package dependencies reduced/clarified:
+ https://github.com/joey711/phyloseq/issues/259
+ Should reduce chances for collisions with other packages, and related issues.
+ Removed any dependencies on the picante package.
+
+ - Replaced picante::node.age() with a faster implementation, node_ages()
+ Appears to be 3 times faster.
+ Speeds up UniFrac() and tip_glom() calculations.
+
+CHANGES IN VERSION 1.7.7
+-------------------------
+
+USER-VISIBLE CHANGES
+
+ - Tree fixes:
+ https://github.com/joey711/phyloseq/issues/235
+ https://github.com/joey711/phyloseq/issues/255
+
+ - If a tree has NA branch-length values, they are automatically set to 0.
+ This occurs within both phyloseq(), and read_tree().
+
+ - UniFrac calculations require a rooted tree. While a rooted tree is
+ not required to be part of a phyloseq object, it is a helpful
+ default behavior to select a random root when UniFrac is called
+ and the tree is unrooted, flashing a notice to the user.
+
+ - Precise import from ape-package, rather than full-import.
+ Smaller chance for collisions.
+ Precisely-defined dependencies listed in NAMESPACE
+
+ - As a result of the previous, phyloseq defines a placeholder "phylo" class,
+ extended from "list". This seems to match the class
+ from a full import of ape, and is necessary since ape does not
+ export the "phylo" class.
+
+CHANGES IN VERSION 1.7.6
+-------------------------
+
+USER-VISIBLE CHANGES
+
+ - Merged branches 1.7.4 and 1.7.5
+
+CHANGES IN VERSION 1.7.5
+-------------------------
+
+NEW FEATURES
+
+ - User-specified axis ordering to plot_heatmap()
+
+ - User-specified axis edges to plot_heatmap()
+
+ - This addresses:
+ [Issue 237](https://github.com/joey711/phyloseq/issues/237)
+ [Issue 230](https://github.com/joey711/phyloseq/issues/230)
+
+USER-VISIBLE CHANGES
+
+ - New arguments to plot_heatmap():
+ `taxa.order`, `sample.order`, `first.sample`, `first.taxa`
+
+CHANGES IN VERSION 1.7.4
+-------------------------
+
+NEW FEATURES
+
+ - import_mothur now handles more formats
+
+ - Added documentation to discourage .group/.list formats
+
+CHANGES IN VERSION 1.7.3
+-------------------------
+
+NEW FEATURES
+
+ - Added phyloseq_to_deseq2() wrapper function and examples for computing
+ multiple OTU tests using Negative Binomial model and GLM (DESeq2).
+
+USER-VISIBLE CHANGES
+
+ - Also added new .Rmd vignette for using DESeq, with colorectal carcinoma data
+
+CHANGES IN VERSION 1.7.2
+-------------------------
+
+USER-VISIBLE CHANGES
+
+ - Reformat NEWS (this) file.
+
+CHANGES IN VERSION 1.7.1
+-------------------------
+
+USER-VISIBLE CHANGES
+
+ - Rmd/HTML-based vignettes. No more Rnw/Sweave/PDF
+
+ - Updated installer
+
+CHANGES IN VERSION 1.5.23
+-------------------------
+
+USER-VISIBLE CHANGES
+
+ - Update dependency from igraph0 to igraph
+
+ - Requested by CRAN
+
+ - igraph0 is being deprecated, removed from CRAN
+
+ - igraph is actively updated/maintained. igraph0 isn't / soon won't be.
+
+ - The load bug that I could find from re-running the tutorial in the new dependency
+ had to do with accessing the vertex names. The appropriate function is:
+ `get.vertex.attribute`
+ which was not used previously, but is now imported and used in this version of phyloseq.
+
+ - This also closes [Issue 247](https://github.com/joey711/phyloseq/issues/247)
+
+CHANGES IN VERSION 1.5.22
+-------------------------
+
+USER-VISIBLE CHANGES
+
+ - Fix minor bug in `import_mothur`
+
+ - Bug occurs only when sample names are pure integers. One line fix.
+
+ - This addresses [Issue 242](https://github.com/joey711/phyloseq/issues/242)
+
+CHANGES IN VERSION 1.5.21
+-------------------------
+
+USER-VISIBLE CHANGES
+
+ - `plot_richness` / `estimate_richness` updates
+
+ - Additional alpha-diversity measures added to both functions.
+
+ - New argument, `measures`, added to both functions, allows user to
+ specify which measures to calculate/display.
+
+ - Also added unit tests for both functions, absent in prior versions
+
+
+CHANGES IN VERSION 1.5.20
+-------------------------
+
+USER-VISIBLE CHANGES
+
+ - Bugfixes, `read_tree_greengenes`, and a `replace` optional parameter to rarefy_even_depth()
+
+ - read_tree_greengenes() is a new function created specifically to address the problem
+ created when the greengenes consortium chose to publish their official release trees
+ with semicolon-delimited node labels, even though semicolons are also a special newick character.
+ The semicolons only appear when a node has more than one taxonomic rank assigned to it,
+ which is rare in the smaller (lower OTU similarity threshold) trees, but a big problem
+ in all the most commonly used trees, e.g. 97% tree.
+ The hard part was identifying the precise cause of the problem. Now that it is precisely known,
+ the offending delimiters are temporarily replaced so that the standard parser `read.tree`
+ can create the phylo object, then they are reinstated.
+ Unit tests have been added to check that this works properly on a GreenGenes release tree
+ that otherwise breaks `read.tree`.
+ This also solves Issue 224
+ https://github.com/joey711/phyloseq/issues/224
+
+ - Fixed a bug in which the highest taxonomic rank fails in `tax_glom`
+ This was a bug resulting from the automatic coercion of single-column
+ matrix subsetting to a vector. Simply changing the relevant line
+ of code in `tax_glom` such that [, , drop=FALSE],
+ solves this issue, Issue 223
+ https://github.com/joey711/phyloseq/issues/223
+ Unit tests have been added that should catch this in the future.
+
+ - rarefy_even_depth() new option to sample without replacement.
+ Two implications to consider are that
+ (1) sampling with replacement is faster and more memory efficient
+ as currently implemented;
+ (2) sampling with replacement means that there is a chance that the
+ number of reads for a given OTU in a given sample could be larger
+ than the original count value. This is in contrast to sampling without replacement
+ where the original count value is the maximum possible.
+ Prior to this phyloseq version,
+ this `replace` parameter did not exist and sampling with replacement was the only
+ random subsampling implemented in the `rarefy_even_depth` function.
+ This prior behavior was selected for computational efficiency, but
+ differs from the behavior of analogous functions in related packages
+ (e.g. rarefying in QIIME).
+
+CHANGES IN VERSION 1.5.19
+-------------------------
+
+USER-VISIBLE CHANGES
+
+ - add gap statistic support for ordination results
+
+ - gapstat_ord() - wrapper function for cluster::clusGap for ordination methods
+
+ - plot_clusgap() - wrapper function for plotting gap statistic results from clusGap()
+
+ - Full examples on the soilrep dataset for gap statistic.
+
+ - Fix bug in eigenvalue proportions for axes other
+ than the first two in ordination plots.
+
+CHANGES IN VERSION 1.5.17
+-------------------------
+
+USER-VISIBLE CHANGES
+
+ - replace URL with new phyloseq article
+ http://dx.plos.org/10.1371/journal.pone.0061217
+
+CHANGES IN VERSION 1.5.16
+-------------------------
+
+USER-VISIBLE CHANGES
+
+ - rarefy_even_depth improvements
+ User-side convenience upgrades to rarefy_even_depth() behavior.
+
+ - More user feedback about process.
+
+ - New argument for setting RNG seed, with a default value (reproducible by default).
+
+ - Reports the RNG seed being used for random subsampling, encouraging users to record this for reproducibility.
+
+ - Trims empty samples/OTUs automatically, and reports this to standard out.
+
+
+CHANGES IN VERSION 1.5.13-15 - build improvements, changed dependencies
+-------------------------
+
+USER-VISIBLE CHANGES
+
+ - Shifted RJSONIO dependency to the new "biom" package in CRAN
+
+CHANGES IN VERSION 1.5.11-12 -
+-------------------------
+
+USER-VISIBLE CHANGES
+
+ - major feature: interface to microbio.me/qiime data repo
+
+ - Now possible to directly download, unpack, import multi-component data
+ into standard phyloseq form in R using a single command using the new
+ microbio_me_qiime()
+ command. Supports the following input styles:
+
+ - full URL to the precise study on the server,
+
+ - a local path to the same compressed raw data file on your system if you
+ already downloaded
+
+ - Just the study number on the microbio.me/qiime repo. For example:
+ microbio_me_qiime(524) will download and import the "smokers" dataset.
+
+ - See the following tutorial for more details:
+ http://joey711.github.io/phyloseq/download-microbio.me.html
+
+CHANGES IN VERSION 1.5.9 - minor bugfix
+-------------------------
+
+USER-VISIBLE CHANGES
+
+ - Bugfix for psmelt()/plotting when entire columns in tax_table are empty
+
+CHANGES IN VERSION 1.5.8 - update reshape dependency to reshape2
+-------------------------
+
+USER-VISIBLE CHANGES
+
+ - This satisfies [issue 134](https://github.com/joey711/phyloseq/issues/134)
+
+CHANGES IN VERSION 1.5.6-.7 - plot_ordination()
+-------------------------
+
+USER-VISIBLE CHANGES
+
+ - plot_ordination() now includes percent variability on axis labels, if possible
+
+CHANGES IN VERSION 1.5.5
+-------------------------
+
+USER-VISIBLE CHANGES
+
+ - Update citation info for latest peer-reviewed phyloseq article
+
+ - http://dx.plos.org/10.1371/journal.pone.0061217
+
+CHANGES IN VERSION 1.5.4 - Add taxonomic classification data to mt() output, if available
+-------------------------
+
+USER-VISIBLE CHANGES
+
+ - This completes a feature request in [Issue 179](https://github.com/joey711/phyloseq/issues/179)
+
+CHANGES IN VERSION 1.5.3
+-------------------------
+
+USER-VISIBLE CHANGES
+
+ - bug fix in plot_heatmap labels derived from ggplot2 change
+
+ - This solves a bug described in [Issue 192](https://github.com/joey711/phyloseq/issues/192)
+
+CHANGES IN VERSION 1.5.2
+-------------------------
+
+USER-VISIBLE CHANGES
+
+ - bug fix pass extra args to `transform_sample_counts`
+
+CHANGES IN VERSION 1.5.0
+-------------------------
+
+USER-VISIBLE CHANGES
+
+ - Level up to R version 3.0.0
+
+CHANGES IN VERSION 1.3.23
+-------------------------
+
+USER-VISIBLE CHANGES
+
+ - bug fix for transform_sample_counts
+
+ - Only appears to affect OTU tables in the "taxa are columns" orientation.
+ It is the result of a surprising behavior of the apply() function,
+ which in this circumstance transposes the table.
+
+ - This solves a bug described in [Issue 186](https://github.com/joey711/phyloseq/issues/186)
+
+CHANGES IN VERSION 1.3.22
+-------------------------
+
+USER-VISIBLE CHANGES
+
+ - additional tests for prune_*() and phyloseq() functions
+
+ - Before releasing v1.3.21 on GitHub:master some gaps in re-order checking were noticed.
+ These are now caught by new unit tests and phyloseq/prune/etc have been further revised to
+ ensure that properly-ordered OTUs are not disordered during a pruning step.
+
+CHANGES IN VERSION 1.3.21
+-------------------------
+
+USER-VISIBLE CHANGES
+
+ - accessor efficiency
+
+ - For instance, ntaxa() and taxa_names() are very slow on large datasets.
+ Some of the highly inefficient approaches are now replaced.
+ The strategy is described further in [Issue 183](https://github.com/joey711/phyloseq/issues/183)
+
+
+CHANGES IN VERSION 1.3.20
+-------------------------
+
+USER-VISIBLE CHANGES
+
+ - reference sequence class update.
+
+ - formally define the new version of phyloseq-class with refseq slot included
+
+ - new/augmented accessors: taxa_names, ntaxa, refseq
+
+ - phyloseq() constructor now supports XStringSet components
+
+ - reconcile_species internal function removed/replaced
+
+ - reconcile_species removed from constructor and package. internal, so no need to deprecate.
+
+ - check the component slot list provided by the "splat" infrastructure
+
+ - prune_taxa
+
+ - Rebuild current example data.
+
+ - print/show methods.
+
+ - subset_taxa
+
+ - add refseq (XStringSet) object argument to import functions. import_qiime, import_biom
+
+ - merge_phyloseq: works with refseq data
+
+ - Add merge_taxa method for XStringSet objects
+
+ - Include reference sequences in example datasets
+
+ - prune_* / reconcile_* / intersect_* .
+ add intersect_samples() function, model after new version of intersect_species()
+ rename intersect_species to intersect_taxa()
+ rm/replace reconcile_* functions with prune_*(intersect_*(), ps).
+ These changes make simpler / DRYer code. Easier to extend.
+
+ - DRY (and hopefully speed) improvements to merge_taxa().
+ Additional speed improvements may be possible for tip_glom, tax_glom in later revisions
+
+CHANGES IN VERSION 1.3.14
+-------------------------
+
+USER-VISIBLE CHANGES
+
+ - Shortened/replaced examples in the longest-running doc examples.
+ This substantially reduces the time it takes to run package checks.
+
+ - Focused on worst offenders
+
+ - Created a new online tutorial for `subset_ord_plot`.
+ Linked to it from `plot_ordination`
+
+ - `plot_taxa_bar` moved to deprecated function file where it belongs,
+ and examples completely removed from doc.
+
+
+CHANGES IN VERSION 1.3.13
+-------------------------
+
+USER-VISIBLE CHANGES
+
+ - Fixed unofficial warning from missing (unregistered) classes
+ of ordination objects. Stems from the internal get_eigenvalue
+ being defined as S4 generic instead of S3.
+ Solves the following logged issue:
+ https://github.com/joey711/phyloseq/issues/166
+
+ - Name of get_eigenvalue generic replaced with extract_eigenvalue,
+ in addition to re-defining as S3 generic.
+
+ - Fixed ggplot2 warning from over-specified "bins" in the
+ plot_scree bar plot. Changed to stat="identity", solves problem
+
+ - Fixed warning when eigenvalues get (slightly) negative. They
+ are now set to 0.0 for the purposes of plotting in plot_scree.
+
+CHANGES IN VERSION 1.3.12
+-------------------------
+
+USER-VISIBLE CHANGES
+
+ - Fixed a parsing issue for some QIIME-produced .biom files
+ that had leading space characters. Issue further described at
+ https://github.com/joey711/phyloseq/issues/171
+ Fixed such that any number of leading/lagging space characters
+ are removed from taxonomic classification entries
+
+ - Fixed build issue on some windows machines derived from problem
+ with figure files having colons in the filename.
+
+CHANGES IN VERSION 1.3.11
+-------------------------
+
+USER-VISIBLE CHANGES
+
+ - Added tree option for import_biom() importer so that users can avoid
+ using merge_phyloseq() if their files are otherwise standard vanilla
+
+ - Address Issue 169 and 167
+ https://github.com/joey711/phyloseq/issues/169
+ https://github.com/joey711/phyloseq/issues/167
+
+CHANGES IN VERSION 1.3.10
+-------------------------
+
+USER-VISIBLE CHANGES
+
+ - Added support for Shannon/Simpson alpha-diversity indices in plot_richness
+ https://github.com/joey711/phyloseq/issues/164
+
+ - All sample_data now embedded in plot_richness output graphic, in case want
+ to use other covariates in additional layers not originally specified.
+ For instance, if you wanted to include geom_text(label=addLayer1).
+ See plot_richness online tutorial.
+
+CHANGES IN VERSION 1.3.9
+-------------------------
+
+USER-VISIBLE CHANGES
+
+ - Added ability to add node labels, bootstrap values to tree graphics
+ generated by plot_tree.
+
+ - The labeling itself is opened-up as a user-provided function to facilitate
+ custom node-labeling needs (including symbols and other ggplot2 geoms)
+
+ - Commonly-needed functions are provided as newly-documented exported
+ functions in the package:
+ nodeplotdefault - adds whatever is in the node label to the graphic
+ nodeplotboot - Adds the labels as bootstrap values, coercing/rounding as needed
+ nodeplotblank - Ensures that node labels are not added.
+
+ - These new functions can be used to give valid arguments to the new
+ `nodelabf` argument in plot_tree().
+
+ - Some other re-organization to plot_tree to show more code in
+ the main plot_tree function.
+
+CHANGES IN VERSION 1.3.8
+-------------------------
+
+USER-VISIBLE CHANGES
+
+ - Fix bug in plot_tree graphic if sample names start with a number
+
+ - This fixes https://github.com/joey711/phyloseq/issues/149
+
+ - Also added fill argument to default aesthetic map definition, useful if fillable shapes defined in subsequent layers
+
+CHANGES IN VERSION 1.3.7
+-------------------------
+
+USER-VISIBLE CHANGES
+
+ - Revisions to plot_tree to improve formatting, organization
+
+ - "ladderize" contributed by Gregory Jordan
+
+ - Color scale option removed from original pull request
+
+ - already supported as core ggplot2 functionality through layering
+
+ - Added links and roxygen2-header revisions for proper doc formatting
+
+ - Fixed bug in which alternative size-variables still labeled as "abundance" in legend.
+
+CHANGES IN VERSION 1.3.6
+-------------------------
+
+USER-VISIBLE CHANGES
+
+ - Updated basics vignette
+
+CHANGES IN VERSION 1.3.5
+-------------------------
+
+USER-VISIBLE CHANGES
+
+ - Updated README.md to point to phyloseq home page (instead of redundant display of content).
+
+ - Also added README.html
+
+CHANGES IN VERSION 1.3.4
+-------------------------
+
+USER-VISIBLE CHANGES
+
+ - Updated tests, examples for merge_samples()
+
+ - Updated some dependency min versions
+
+ - First 2013 commit
+
+CHANGES IN VERSION 1.3.3
+-------------------------
+
+USER-VISIBLE CHANGES
+
+ - updates to import_qiime()
+
+ - modular building of taxonomy table in import_qiime() and import_biom()
+
+CHANGES IN VERSION 1.3.2
+-------------------------
+
+USER-VISIBLE CHANGES
+
+ - import_biom() fixes.
+
+ - More robust data handling for taxonomy table
+
+ - Flexibility: Can take custom parsing function for taxonomy vectors
+
+ - tests added for new parsing functions
+ parse_taxonomy_default
+ parse_taxonomy_greengenes
+
+CHANGES IN VERSION 1.3.1
+-------------------------
+
+USER-VISIBLE CHANGES
+
+ - New function: plot_bar() that is simpler and more reliable than plot_taxa_bar.
+
+ - plot_taxa_bar now deprecated.
+
+ - labeled deprecated in title doc
+
+ - replaced examples in vignette with plot_bar examples
+
+CHANGES IN VERSION 1.3.0
+-------------------------
+
+USER-VISIBLE CHANGES
+
+ - Level up github devel version number to reflect devel order/status relative to latest BioC release
+
+
+CHANGES IN VERSION 1.1.58
+-------------------------
+
+USER-VISIBLE CHANGES
+
+ - Fixed a bug in import_qiime in which import fails if there are any empty taxonomy
+ string fields in a file that otherwise has some taxonomic assignments.
+ Now the taxonomy entries for that OTU (row in the taxonomy table) are left all NA_character_ by default.
+
+CHANGES IN VERSION 1.1.57
+-------------------------
+
+USER-VISIBLE CHANGES
+
+ - Added automatic removal of single and double quotes from phylogenetic tree tip names provided to the phyloseq-constructor.
+ This should help avoid problems
+ importing data that includes a tree, for the cases where the taxa/OTU names don't match because of these extra quotation marks. Added directly into the
+ phyloseq() constructor, so that it doesn't matter how/where the tree was imported/added.
+
+ - Removal of quotes is only initiated when OTU/taxa names fail to match AT ALL
+ (intersection is length zero) between component taxa names.
+
+ - This feature needs to be added to unit tests as well.
+
+CHANGES IN VERSION 1.1.55
+-------------------------
+
+USER-VISIBLE CHANGES
+
+ - Added "title" argument to plot_scree() and plot_ordination(…, type="scree"), for consistency.
+
+CHANGES IN VERSION 1.1.54
+-------------------------
+
+USER-VISIBLE CHANGES
+
+ - Changed "species" label in different plot_ordination() graphics to "taxa"
+
+ - Fixed an issue with a default parameter in a related internal function
+ (changed "samples" to "sites" for vegan compatibility).
+
+CHANGES IN VERSION 1.1.53
+-------------------------
+
+USER-VISIBLE CHANGES
+
+ - Fixed additional title issues in plot_network and plot_richness
+
+CHANGES IN VERSION 1.1.52
+-------------------------
+
+USER-VISIBLE CHANGES
+
+ - Added plot_scree() function for making eigenvalue "scree" plots in ggplot2
+
+ - Added corresponding type="scree" option to plot_ordination, for convenience.
+
+CHANGES IN VERSION 1.1.51
+-------------------------
+
+USER-VISIBLE CHANGES
+
+ - Returned to "Imports:" dependency for ggplot2 following a ggplot2 bug fix
+
+CHANGES IN VERSION 1.1.50
+-------------------------
+
+USER-VISIBLE CHANGES
+
+ - Fixed several compatibility issues to support latest version of ggplot2 (0.9.2).
+
+ - Also changes plot_richness_estimates() to plot_richness().
+
+CHANGES IN VERSION 1.1.45
+-------------------------
+
+USER-VISIBLE CHANGES
+
+ - Backward compatibility for import_qiime_sampleData,
+ now superseded by import_qiime_sample_data
+
+ - Added a functioning example based on the GlobalPatterns
+ example sample-map file included in the package extdata.
+
+CHANGES IN VERSION 1.1.44
+-------------------------
+
+USER-VISIBLE CHANGES
+
+ - Fixed minor bug in tax_glom function.
+ Thanks to Katie Shelef for the bug report. Bug only affected tax_glom behavior when the
+ right-most rank was specified as the position for merging.
+
+CHANGES IN VERSION 1.1.43
+-------------------------
+
+USER-VISIBLE CHANGES
+
+ - fixed distance() issue from species/taxa replacement for type argument.
+
+CHANGES IN VERSION 1.1.42
+-------------------------
+
+USER-VISIBLE CHANGES
+
+ - fixed make_network/plot_network issue from species/taxa replacement for type argument.
+
+CHANGES IN VERSION 1.1.41
+-------------------------
+
+USER-VISIBLE CHANGES
+
+ - Fixed documentation for `prune_taxa` and `prune_samples`
+
+ - Updated `prune_samples` method to allow for logical vectors.
+
+ - Fixed `prune_taxa` so that it properly fails with a message if the taxa argument is a logical of wrong length.
+ There was some potential (and no warning) for unpredictable vector-recycling
+ with short vectors in the old implementation.
+
+CHANGES IN VERSION 1.1.40
+-------------------------
+
+USER-VISIBLE CHANGES
+
+ - Huge Update and Renaming Event.
+
+ - Made all functions use an *underscore* for English word delimiter, if they were using an abbreviation.
+
+ - Replaced "species" in all function names with "taxa".
+
+ - These changes are all backward compatible, for now, so your old code should work.
+ Let me know if it doesn't and I will quickly make the adjustment. This will remain true through the next official release,
+ but functional references to "species" will not be supported afterward,
+ except in the occasions where you actually mean taxonomic species, like `tax_glom(x, "species")`.
+
+CHANGES IN VERSION 1.1.33
+-------------------------
+
+USER-VISIBLE CHANGES
+
+ - Revise taxglom() such that it handles phyloseq and taxonomyTable classes,
+ throws warning otherwise. It should not take a manually-produced character
+ vector, as this is roughly equivalent to functionality supported in other methods,
+ especially prune_species()/merge_species().
+
+ - Also added unit-tests and executable examples for taxglom().
+ Got rid of taxglom.internal, incorporated directly into taxglom().
+ taxglom() is no longer an S4-method, and
+ doesn't need to be now that the character-vector argument option is omitted,
+ with S4-class handling delegated to merge_species().
+ Updated "taxTab<-" to be S4 assignment, clearer handling of taxonomy Table assignments,
+ especially useful for taxglom.
+
+CHANGES IN VERSION 1.1.29
+-------------------------
+
+USER-VISIBLE CHANGES
+
+ - Add unit tests and example files for import_biom (as well as import("biom",...) ).
+
+CHANGES IN VERSION 1.1.28
+-------------------------
+
+USER-VISIBLE CHANGES
+
+ - Added rarefy_even_depth() function for random subsampling of microbiome samples to the same number of reads.
+ Default uses the minimum total reads among the samples in the dataset. This is based on the core "sample" function,
+ which can have its random number generator fixed by set.seed for reproducibility.
+
+CHANGES IN VERSION 1.1.27
+-------------------------
+
+USER-VISIBLE CHANGES
+
+ - Fix bug in plot_ordination that caused an error rather than producing unannotated plots when sampleData absent in the input.
+
+CHANGES IN VERSION 1.1.23-26
+-------------------------
+
+USER-VISIBLE CHANGES
+
+ - Added unit tests and bugfixes
+
+CHANGES IN VERSION 1.1.19-22
+-------------------------
+
+USER-VISIBLE CHANGES
+
+ - Improving import_qiime() importer to handle large datasets,
+ like the HMPv35 dataset, for example, while also providing useful status messages
+ during non-trivial imports that might take 10 minutes or more to complete.
+
+CHANGES IN VERSION 1.1.18
+-------------------------
+
+USER-VISIBLE CHANGES
+
+ - Added replicate labels as a "Sample" factor in the soilrep dataset.
+
+CHANGES IN VERSION 1.1.17
+-------------------------
+
+USER-VISIBLE CHANGES
+
+ - Fix possible bug that results from the latest version (0.6+) of igraph not being backward compatible.
+ A stable igraph0 package is available on CRAN as a stop-gap, and so all igraph dependencies were migrated
+ to "igraph0" until the phyloseq-source can be updated to match the igraph latest.
+
+CHANGES IN VERSION 1.1.15
+-------------------------
+
+USER-VISIBLE CHANGES
+
+ - plot_heatmap: Added default (but adjustable) threshold to omit taxa/sample labels
+
+CHANGES IN VERSION 1.1.14
+-------------------------
+
+USER-VISIBLE CHANGES
+
+ - Update import_qiime() function to import latest non-BIOM qiime output files.
+ Also added check for presence of taxonomy information (consensus lineage).
+
+CHANGES IN VERSION 1.1.10
+-------------------------
+
+USER-VISIBLE CHANGES
+
+ - Add plot_heatmap() function, for easy flexible heat maps built with ggplot2
+
+CHANGES IN VERSION 1.1.8-9
+-------------------------
+
+USER-VISIBLE CHANGES
+
+ - Fix bug for some variants of new BIOM format
+
+ - Add import_RDP_otu() import function for new RDP pipeline export format
+
+CHANGES IN VERSION 1.1.7
+-------------------------
+
+USER-VISIBLE CHANGES
+
+ - Removed the old plot_tree_phyloseq() function, in favor of the new ggplot2-based plot_tree()
+
+ - Uncommented / tested formal examples in documentation of plot-functions
+
+ - Updated variable names and doc for the plot_taxa_bar() function
+
+CHANGES IN VERSION 1.1.6
+-------------------------
+
+USER-VISIBLE CHANGES
+
+ - Update vignette with plot_tree() example, replacing the
+ old base-graphics function, plot_tree_phyloseq().
+
+ - Fix bug in legend for trees with size mapped to abundance
+
+CHANGES IN VERSION 1.1.5
+-------------------------
+
+USER-VISIBLE CHANGES
+
+ - Add initial version of tree_plot(), built with ggplot2
+
+ - Adds several internal functions borrowed from devel version of ggphylo
+
+CHANGES IN VERSION 1.1.4
+-------------------------
+
+USER-VISIBLE CHANGES
+
+ - Add errorIfNULL option to auxiliary accessors
+ (e.g. sample.variables(), rank.names())
+
+CHANGES IN VERSION 1.1.1-3
+-------------------------
+
+USER-VISIBLE CHANGES
+
+ - R version updated to match Bioconductor, R-2.15.0+
+
+ - ape-package version updated to 3.0+
+
+ - ape-package now import dependency
+
+ - ggplot2-package version updated to 0.9.0+
+
+ - ggplot2-package now import dependency
+
+CHANGES IN VERSION 0.99.48
+-------------------------
+
+USER-VISIBLE CHANGES
+
+ - Updated README.md
+
+CHANGES IN VERSION 0.99.47
+-------------------------
+
+USER-VISIBLE CHANGES
+
+ - Add support for ordinate() to take dist-object instead of distance-method string.
+
+ - Update the documentation for ordinate() to reflect change.
+
+ - Updated README.md to describe new tools, distance() and ordinate()
+
+ - Update JSD documentation.
+
+CHANGES IN VERSION 0.99.46
+-------------------------
+
+USER-VISIBLE CHANGES
+
+ - Added support and documentation for Jensen-Shannon Divergence to distance().
+
+ - Some updates to distance() function and its documentation.
+
+CHANGES IN VERSION 0.99.45
+-------------------------
+
+USER-VISIBLE CHANGES
+
+ - Completely remove vegdist documentation from phyloseq by omitting all roxygen2
+ headers except for #' keywords internal
+
+ - This finally fixes it, without build warnings/errors, once and for all.
+
+CHANGES IN VERSION 0.99.44
+-------------------------
+
+USER-VISIBLE CHANGES
+
+ - Make the phyloseq::vegdist() wrapper an internal function. Documentation updated as well.
+ This is a solution to Issue 87
+ https://github.com/joey711/phyloseq/issues/87
+
+ - Remove rda.phyloseq and cca.phyloseq from exported methods.
+ Documentation had already been removed when these were first attempted to be converted
+ to internal methods in CHANGES IN VERSION 0.99.42
+
+ - Modify make_sample_network to use the new distance() function, and be able to pass on
+ additional parameters to distance().
+
+ - Re-order arguments to make_sample_network to better represent the stability of defaults.
+
+ - Removed redundant parameters from make_sample_network(). Transformations to abundance
+ tables should be performed upstream using phyloseq tools, not embedded in this function.
+
+ - Modify vignettes to reflect these changes.
+
+ - Rebuild vignette.
+
+ - Update import() documentation to reflect support for BIOM format, import_biom() function.
+
+CHANGES IN VERSION 0.99.43
+-------------------------
+
+USER-VISIBLE CHANGES
+
+ - Revise component assignment operators, e.g. "sampleData(physeq)<-" to coerce/access
+ from more diverse objects based on context. Documentation for these operators updated.
+ This is a more general solution to Issue 68
+ https://github.com/joey711/phyloseq/issues/68
+
+CHANGES IN VERSION 0.99.42
+-------------------------
+
+USER-VISIBLE CHANGES
+
+ - purge rda.phyloseq and cca.phyloseq from documentation, vignettes. Internal functions.
+
+ - analysis vignette revisions
+
+CHANGES IN VERSION 0.99.41
+-------------------------
+
+USER-VISIBLE CHANGES
+
+ - Added ordinate() function, general ordination wrapper.
+
+CHANGES IN VERSION 0.99.40
+-------------------------
+
+USER-VISIBLE CHANGES
+
+ - Fixed case issues with source filenames.
+ Required multiple commits, break in versioning for source files that had improper letter case (e.g. ".r")
+
+CHANGES IN VERSION 0.99.37 - 0.99.39
+-------------------------
+
+USER-VISIBLE CHANGES
+
+ - Vignette updates.
+
+CHANGES IN VERSION 0.99.36
+-------------------------
+
+USER-VISIBLE CHANGES
+
+ - Added general distance calculation wrapper, distance()
+
+ - Consolidated it with unifrac.R code in a new "distance-methods.R" source file.
+
+CHANGES IN VERSION 0.99.35
+-------------------------
+
+USER-VISIBLE CHANGES
+
+ - Added tool to retain user-defined subset of points from an ordination, subset_ord_plot
+
+ - Updates to vignette
+
+ - Remove a poorly-documented, unpublished example dataset, ex1 -> and all references to it in documentation
+
+CHANGES IN VERSION 0.99.34
+-------------------------
+
+USER-VISIBLE CHANGES
+
+ - Updates to vignette
+
+CHANGES IN VERSION 0.99.33
+-------------------------
+
+USER-VISIBLE CHANGES
+
+ - Allow color specification for "biplot" type in plot_ordination()
+
+ - bug fix for certain combinations of biplot options.
+
+CHANGES IN VERSION 0.99.32
+-------------------------
+
+USER-VISIBLE CHANGES
+
+ - Remove plot_ordination_biplot() from phyloseq -> replaced by: plot_ordination()
+
+ - pipes "|->" also removed from vignette.
+
+ - Remove calcplot(). -> Will be replaced by pipeline_() methods
+
+ - plot_ordination() shown in vignette for "species" plot type.
+
+ - Fix - Allow flexible axis labels for plot_ordination()
+
+CHANGES IN VERSION 0.99.31
+-------------------------
+
+USER-VISIBLE CHANGES
+
+ - Aesthetic fixes in plot_ordination()
+
+ - Remove plot_ordination_samples() -> redundant, covered by plot_ordination().
+
+CHANGES IN VERSION 0.99.30
+-------------------------
+
+USER-VISIBLE CHANGES
+
+ - Added plot_ordination -> general ordination plotting function
+
+CHANGES IN VERSION 0.99.29
+-------------------------
+
+USER-VISIBLE CHANGES
+
+ - Added first-draft of analysis vignette to package
+
+ - Total package-space and build/check times tested
+
+CHANGES IN VERSION 0.99.28
+-------------------------
+
+USER-VISIBLE CHANGES
+
+ - Fixed bug when specifying alternative axes in plot_ordination_samples()
+
+CHANGES IN VERSION 0.99.27
+-------------------------
+
+USER-VISIBLE CHANGES
+
+ - Added DPCoA() function for Double Principal Coordinate Analysis, relies heavily on ade4.
+
+ - Added ade4 dependency. Some masking, but no errors apparent.
+
+ - Added dpcoa extension for scores()
+
+ - minor bugfix in plot_ordination_biplot().
+
+CHANGES IN VERSION 0.99.26
+-------------------------
+
+USER-VISIBLE CHANGES
+
+ - Add plot_ordination_samples() for convenient ordination plotting
+
+ - Renamed plot_ordination_phyloseq to more precise, plot_ordination_biplot()
+
+ - Added extension to vegan::scores for ape::pcoa results, scores.pcoa()
+
+CHANGES IN VERSION 0.99.25
+-------------------------
+
+USER-VISIBLE CHANGES
+
+ - Fix build warning from duplicate import in namespace
+
+ - Update README.md
+
+CHANGES IN VERSION 0.99.24
+-------------------------
+
+USER-VISIBLE CHANGES
+
+ - Advanced network/graph plotting/visualization wrappers: make_sample_network(), plot_sample_network()
+
+ - Advanced alpha diversity wrappers: plot_richness_estimates(), estimate_richness()
+
+ - Added taxafilter() function for even more convenient filtering of species/taxa
+
+ - Added reshape dependency for explicit use of melt() function.
+
+CHANGES IN VERSION 0.99.23
+-------------------------
+
+USER-VISIBLE CHANGES
+
+ - Added a rooted tree to "GlobalPatterns" dataset
+
+ - Allow multi-class tests with mt()
+
+CHANGES IN VERSION 0.99.20
+-------------------------
+
+USER-VISIBLE CHANGES
+
+ - Added "GlobalPatterns" dataset, and some other minor improvements
+
+ - added "GlobalPatterns" dataset, with doc and some analysis
+ examples. More examples will be added to the vignette.
+
+ - add getVariable accessor function, for streamlining access
+ to values/vectors/factors/etc of the variates contained in
+ the sampleData component
+
+ - revise documentation about parallelization of import_biom()
+
+ - revise documentation for enterotype dataset
+
+CHANGES IN VERSION 0.99.19
+-------------------------
+
+USER-VISIBLE CHANGES
+
+ - fixed a bug in general import() function in which it properly processed import command, but then failed to return result.
+
+CHANGES IN VERSION 0.99.18
+-------------------------
+
+USER-VISIBLE CHANGES
+
+ - Added import_biom() function to import BIOM format OTU-clustered data / metadata
+
+ - Added (direct) dependency for plyr, as well as RJSONIO
+
+CHANGES IN VERSION 0.4.2-7
+-------------------------
+
+USER-VISIBLE CHANGES
+
+ - Improvements to import_mothur() via import_mothur_otulist().
+
+ - merge_samples() function added, in preparation for hypergeometric (fisher.test) test wrapper.
+
+ - Fixed bug in merge_phyloseq() for certain combinations of objects.
+
+ - Various other fixes and improvements.
+
+CHANGES IN VERSION 0.4.1
+-------------------------
+
+USER-VISIBLE CHANGES
+
+ - First official submission to Bioconductor
+
+ - Includes many build-fixes, bug-fixes
+
+ - Final tweaks to conform to Bioconductor guidelines
+
+CHANGES IN VERSION 0.3
+-------------------------
+
+USER-VISIBLE CHANGES
+
+ - Added Importers for mothur, RDP-pipeline, and PyroTagger
+
+ - Lots of new plotting and analysis support
+
+ - Parallelized UniFrac calculations using foreach package
+
+ - Some improvements to the class inheritance structure to simplify method extension and future development.
+
+ - Function-level documentation
+
+ - Vignette-level documentation
+
+ - Package dependencies achieved via import for all packages with a namespace. Depends (full load) otherwise.
+
+CHANGES IN VERSION 0.2.4
+-------------------------
+
+USER-VISIBLE CHANGES
+
+ - Added ordination and plotting pipeline, calcplot() and plot.ordination.phyloseq()
+
+ - All trees converted to phylo4 by default. Support for phylo achieved by coercion to phylo4.
+
+CHANGES IN VERSION 0.2
+-------------------------
+
+USER-VISIBLE CHANGES
+
+ - Modifications for proper build, and bug tests.
+
+CHANGES IN VERSION 0.1
+-------------------------
+
+USER-VISIBLE CHANGES
+
+ - basic data structure in place.
+
+ - some support for phylo4 objects
+
+ - import_qiime(): A Qiime input wrapper
+
diff --git a/inst/doc/Unweighted_UniFrac.RData b/inst/doc/Unweighted_UniFrac.RData
new file mode 100644
index 0000000..4013f3e
Binary files /dev/null and b/inst/doc/Unweighted_UniFrac.RData differ
diff --git a/inst/doc/phyloseq-FAQ.R b/inst/doc/phyloseq-FAQ.R
new file mode 100644
index 0000000..fb1d445
--- /dev/null
+++ b/inst/doc/phyloseq-FAQ.R
@@ -0,0 +1,31 @@
+## ---- warning=FALSE, message=FALSE---------------------------------------
+library("phyloseq"); packageVersion("phyloseq")
+library("ggplot2"); packageVersion("ggplot2")
+theme_set(theme_bw())
+
+## ------------------------------------------------------------------------
+data(esophagus)
+plot_tree(esophagus)
+
+## ------------------------------------------------------------------------
+p1 = plot_tree(esophagus, color = "Sample")
+p1
+p1 +
+ ggtitle("This is my title.") +
+ annotate("text", 0.25, 3,
+ color = "orange",
+ label = "my annotation")
+
+## ------------------------------------------------------------------------
+data("esophagus")
+mdf = psmelt(esophagus)
+# Simple bar plot. See plot_bar() for more.
+ggplot(mdf, aes(x = Sample,
+ y = Abundance)) +
+ geom_bar(stat = "identity", position = "stack", color = "black")
+# Simple heat map. See plot_heatmap() for more.
+ggplot(mdf, aes(x = Sample,
+ y = OTU,
+ fill = Abundance)) +
+ geom_raster()
+
diff --git a/inst/doc/phyloseq-FAQ.Rmd b/inst/doc/phyloseq-FAQ.Rmd
new file mode 100644
index 0000000..089c143
--- /dev/null
+++ b/inst/doc/phyloseq-FAQ.Rmd
@@ -0,0 +1,562 @@
+---
+title: "phyloseq Frequently Asked Questions (FAQ)"
+date: "`r date()`"
+author: "Paul McMurdie and Susan Holmes"
+output:
+ BiocStyle::html_document:
+ fig_height: 7
+ fig_width: 10
+ toc: yes
+ toc_depth: 2
+ number_sections: true
+vignette: >
+ %\VignetteIndexEntry{phyloseq Frequently Asked Questions (FAQ)}
+ %\VignetteEngine{knitr::rmarkdown}
+ %\VignetteEncoding{UTF-8}
+---
+
+
+This vignette includes answers and supporting materials that address
+[frequently asked questions (FAQs)](https://en.wikipedia.org/wiki/FAQ),
+especially those posted on
+[the phyloseq issues tracker](https://github.com/joey711/phyloseq/issues).
+
+For most issues
+[the phyloseq issues tracker](https://github.com/joey711/phyloseq/issues)
+should suffice; but occasionally there are questions
+that are asked repeatedly enough that it becomes appropriate
+to canonize the answer here in this vignette.
+This is both
+(1) to help users find solutions more quickly, and
+(2) to mitigate redundancy on
+[the issues tracker](https://github.com/joey711/phyloseq/issues).
+
+All users are encouraged to perform a google search
+and review other questions/responses to both open and closed issues
+on [the phyloseq issues tracker](https://github.com/joey711/phyloseq/issues)
+before seeking an active response by posting a new issue.
+
+
+```{r, warning=FALSE, message=FALSE}
+library("phyloseq"); packageVersion("phyloseq")
+library("ggplot2"); packageVersion("ggplot2")
+theme_set(theme_bw())
+```
+
+
+# - I tried reading my biom file using phyloseq, but it didn’t work. What’s wrong?
+
+The most common cause for this error
+is derived from a massive change to the way biom files are stored on disk.
+There are currently two "versions" of the biom-format,
+each of which stores data very differently.
+The original format -- and original support in phyloseq --
+was for biom-format version 1 based on [JSON](https://en.wikipedia.org/wiki/JSON).
+
+The latest version -- version 2 -- is based on the
+[HDF5](https://www.hdfgroup.org/HDF5/doc/UG/index.html) file format,
+and this new biom format version
+recently became the default file output format
+for popular workflows like QIIME.
+
+## Good News: HDF5-biom should be supported in next release
+
+The *biomformat* package is the Bioconductor incarnation
+of R package support for the biom file format,
+written by Paul McMurdie (phyloseq author)
+and Joseph Paulson (metagenomeSeq author).
+Although it has been available on GitHub and BioC-devel
+for many months now,
+the first release version of *biomformat*
+on Bioconductor will be in April 2016.
+In that same release, phyloseq will switch over
+from the JSON-only *biom* package hosted on CRAN
+to this new package, *biomformat*,
+which simultaneously supports biom files
+based on either HDF5 or JSON.
+
+This difference will be largely opaque to users,
+and phyloseq will "just work" after the next release in April.
+
+Use the `import_biom` function to read your recent
+QIIME or other biom-format data.
+
+Additional background details are described in
+[Issue 443](https://github.com/joey711/phyloseq/issues/443).
+
+## HDF5 (Version 2.0) biom-format: *biomformat*
+
+As just described,
+HDF5 biom format is currently supported
+in the development version of phyloseq,
+via the new beta/development package called *biomformat*
+on BioC-devel and GitHub:
+
+https://github.com/joey711/biomformat
+
+If you need to use HDF5-based biom format files **immediately**
+and cannot wait for the upcoming release,
+then you should install the development version
+of the *biomformat* package by following the instructions
+at the link above.
+
+## Not every data component is included in .biom files
+
+Even though the biom-format supports the self-annotated inclusion
+of major components like the taxonomy table and sample data table,
+many tools that generate biom-format files
+(like QIIME, MG-RAST, mothur, etc.)
+do not export this data, even if you provided
+the information in your data input files.
+The reason for this boggles me,
+and I've shared my views on this with QIIME developers,
+but there nevertheless seems to be no plan to include your sample data
+in the output biom file.
+
+Furthermore, even though I have proposed it to the biom-format team,
+there is currently no support (or timeline for support)
+for inclusion of a phylogenetic tree within a ".biom" file.
+
+A number of tutorials are available
+demonstrating how one can add components to a phyloseq object
+after it has been created/imported.
+The following tutorial is especially relevant
+
+http://joey711.github.io/phyloseq-demo/import-biom-sd-example.html
+
+Which makes use of the following functions:
+
+- `import_qiime_sample_data`
+- `merge_phyloseq`
+
+## Other issues related to the biom-format
+
+There are a number of different Issue Tracker posts
+discussing this format with respect to phyloseq:
+
+https://github.com/joey711/phyloseq/issues/302
+
+https://github.com/joey711/phyloseq/issues/272
+
+https://github.com/joey711/phyloseq/issues/392
+
+[Issue 443](https://github.com/joey711/phyloseq/issues/443)
+has details for updated format.
+
+
+# - `microbio_me_qiime()` returned an error. What’s wrong?
+
+## The QIIME-DB Server is Permanently Down.
+
+The QIIME-DB server is permanently down.
+
+Users are suggested to migrate their queries over to Qiita.
+
+Indeed, the previous link to
+[microbio.me/qiime](http://www.microbio.me/qiime/index.psp)
+now sends users to the new Qiita website.
+
+## An interface to Qiita is Planned.
+
+Stay tuned. The Qiita API needs to be released by the Qiita developers first.
+The phyloseq developers have no control over this,
+as we are not affiliated directly with the QIIME developers.
+Once there is an official Qiita API with documentation,
+an interface for phyloseq will be added.
+
+We found the `microbio_me_qiime()` function
+to be very convenient while the QIIME-DB server lasted.
+Hopefully an equivalent is hosted soon.
+
+
+# - I want a phyloseq graphic that looks like...
+
+Great!
+
+**Every plot function in phyloseq returns a ggplot2 object**.
+When these objects are "printed" to standard output in an R session,
+for instance,
+
+```{r}
+data(esophagus)
+plot_tree(esophagus)
+```
+
+then the graphic is rendered in
+[the current graphic device](https://stat.ethz.ch/R-manual/R-devel/library/grDevices/html/Devices.html).
+
+Alternatively, if you save the object output from a phyloseq `plot_` function
+as a variable in your session,
+then you can further modify it, interactively, at your leisure.
+For instance,
+
+```{r}
+p1 = plot_tree(esophagus, color = "Sample")
+p1
+p1 +
+ ggtitle("This is my title.") +
+ annotate("text", 0.25, 3,
+ color = "orange",
+ label = "my annotation")
+```
+
+There are lots of ways for you to generate custom graphics
+with phyloseq as a starting point.
+
+The following sections list some of my favorites.
+
+## Modify the ggplot object yourself.
+
+For example,
+[the plot_ordination() examples tutorial](http://joey711.github.io/phyloseq/plot_ordination-examples.html)
+provides several examples of using additional ggplot2 commands
+to modify/customize the graphic encoded in the ggplot2 object
+returned by `plot_ordination`.
+
+[The ggplot2 documentation](http://docs.ggplot2.org/current/)
+is the current and canonical online reference
+for creating, modifying, and developing with ggplot2 objects.
+
+For simple changes to aesthetics and aesthetic mapping,
+[the aesthetic specifications vignette](http://docs.ggplot2.org/current/vignettes/ggplot2-specs.html)
+is a useful resource.
+
+
+## psmelt and ggplot2
+
+The `psmelt` function converts your phyloseq object
+into a table (`data.frame`)
+that is very friendly for defining a custom ggplot2 graphic.
+This function was originally created
+as an internal (not user-exposed) tool
+within phyloseq to enable
+a [DRY](https://en.wikipedia.org/wiki/Don%27t_repeat_yourself)
+approach to building ggplot2 graphics
+from microbiome data represented as phyloseq objects.
+
+When applicable, the phyloseq `plot_` family of functions
+use `psmelt`.
+This function is now a documented
+and user-accessible function in phyloseq --
+for the main purpose of enabling users
+to create their own ggplot2 graphics as needed.
+
+There are lots of great documentation examples for ggplot2 at
+
+- [the ggplot2 official documentation site](http://docs.ggplot2.org/current/),
+- [ggplot2 on StackOverflow](http://stackoverflow.com/tags/ggplot2), and
+- [phyloseq documentation pages](https://joey711.github.io/phyloseq/).
+
+The following are two very simple examples of using
+`psmelt` to define your own ggplot2 object "from scratch".
+It should be evident that you could include further ggplot2 commands
+to modify each plot further, as you see fit.
+
+```{r}
+data("esophagus")
+mdf = psmelt(esophagus)
+# Simple bar plot. See plot_bar() for more.
+ggplot(mdf, aes(x = Sample,
+ y = Abundance)) +
+ geom_bar(stat = "identity", position = "stack", color = "black")
+# Simple heat map. See plot_heatmap() for more.
+ggplot(mdf, aes(x = Sample,
+ y = OTU,
+ fill = Abundance)) +
+ geom_raster()
+```
+
+## Submit a Pull Request (Advanced)
+
+If your new custom plot function is awesome and you think others might use it,
+add it to the `"plot-methods.R"` source file
+and submit a pull request on GitHub.
+
+[GitHub Official Pull Request Documentation](https://help.github.com/articles/using-pull-requests/)
+
+Please include example and test code
+in the code included in your pull request.
+
+I'll try and add it to the package by the next release.
+I will also give you authorship credit in the function doc.
+See the "typo fix" section below for further details about GitHub pull requests...
+
+## Define a ggplot2 extension (Advanced)
+
+Development of new R functions/commands
+for creating/modifying new geometric objects
+is now formally documented in
+[the ggplot2 extension vignette](http://docs.ggplot2.org/current/vignettes/extending-ggplot2.html).
+
+This may be related to the previous section,
+in that your ggplot2 extension for phyloseq
+could be contributed to the phyloseq project as a pull request.
+
+
+# - There’s a typo in phyloseq documentation, tutorials, or vignettes
+
+This is something that is actually faster and less work
+for you to solve yourself
+and contribute back to the phyloseq package.
+For trivial typo fixes,
+I will quickly include your fixes into the package code.
+Sometimes I accept them on my cell phone
+while I'm still in bed.
+No wasted time on either end! :-)
+
+The point is that this should be simple,
+and is simple if you follow one of the following suggestions.
+
+## Fix the typo directly on GitHub
+
+GitHub now provides the option to make changes
+to code/text of a repository
+directly from your web browser through an in-page editor.
+This handles all the Git details for you.
+If you have a GitHub account and you're logged in,
+all you'd have to do is locate the file with the offending typo,
+then use the "edit" button to
+make the changes and
+send them to me as a pull request.
+
+## Minimal GIT and GitHub Exercise
+
+![](http://i.imgur.com/j9NYXiQ.png)
+
+(The following instructions are borrowed
+from [Yihui Xie's site about fixing typos](http://yihui.name/en/2013/06/fix-typo-in-documentation/))
+
+Alternatively, for those who want to try GIT and Github pull requests,
+which make it possible for you to contribute to open source
+and fix obvious problems with no questions being asked --
+just do it yourself, and send the changes to the original author(s) through Github.
+
+The official documentation for Github pull requests
+is a little bit verbose for beginners.
+Basically what you need to do for simple tasks are:
+
+1. click the Fork button and clone the repository in your own account;
+2. make the changes in your cloned version;
+3. push to your repository;
+4. click the Pull Request button to send a request to the original author;
+
+
+
+# - I read ["Waste Not, Want Not..."](http://journals.plos.org/ploscompbiol/article?id=10.1371/journal.pcbi.1003531) but...
+
+Before getting to more specific issues,
+let's start by keeping appropriately separate the concept of
+
+- (1) denoising amplicon sequences, and/or denoising features in the contingency table, and
+- (2) standardization
+
+These two concepts have often been conflated --
+mostly by purveyors of methods that use rarefying --
+wrongly insisting that rarefying is somehow addressing both problems
+and the matter is settled.
+Unfortunately rarefying is a very inefficient, noise-introducing method
+that poorly addresses the data analysis challenges that motivate either concept.
+
+DESeq2 and related solutions can help you address
+the need for standardization (e.g. differing library sizes)
+at a particular step in your analysis
+while still making efficient inferences from your data.
+
+The denoising problem is best addressed at the sequence-processing level,
+and the best general-purpose options currently available are:
+
+- [The dada2 algorithm](http://benjjneb.github.io/dada2/), if your data works well with it. Current support is mainly Illumina sequence data, or
+- [UPARSE](http://drive5.com/uparse/) in the usearch package, if you don't have sequencing data that works well with [dada2](http://benjjneb.github.io/dada2/)
+
+
+## I tried to [use DESeq2](http://joey711.github.io/phyloseq-extensions/DESeq2.html) to normalize my data, but now I don't know what to do...
+
+The answer to a question of this category depends a lot on your experiment,
+and what you want to learn from your data.
+The following are some resources that may help.
+
+- [Waste Not, Want Not Supplemental Materials](http://joey711.github.io/waste-not-supplemental/)
+- [Differential Abundance Vignette](https://www.bioconductor.org/packages/release/bioc/vignettes/phyloseq/inst/doc/phyloseq-mixture-models.html)
+- [The phyloseq front page](https://joey711.github.io/phyloseq/)
+
+
+## My libraries/samples had different total number of reads, what do I do?
+
+That is an expected artifact of current sequencing technologies,
+and not a "problem" on its own.
+In most cases, differences in total counts are uncorrelated
+with any variable in your experimental design.
+**You should check that this is the case**.
+It remains possible that there are structural/procedural artifacts
+in your experiment that have influenced the total counts.
+If library sizes are correlated with one of your design variables,
+then this *might* represent an artifact that you need to address more carefully.
+This is a decision that you will have to make and defend.
+No software package or workflow can address this for you,
+but phyloseq/R can certainly help you check for correlation.
+See the `sample_sums()` and `sample_data()` accessor functions.
+
+Other than the portent of structural biases in your experiment,
+you should recall that
+comparisons between observation classes that have
+**uneven sample sizes are neither a new nor an unsolved problem in statistics**.
+
+The most useful analytical methods you can use in this context
+are therefore methods that expect and account
+for differences in total number of reads between samples.
+
+How you account for these *library size* differences
+should depend on the type of analysis in which you are engaged,
+and which methods you plan to use.
+For instance, for a beta-diversity measure like
+Bray-Curtis Dissimilarity,
+you might simply use the relative abundance of each taxon in each sample,
+as the absolute counts are not appropriate to use directly
+in the context where count differences are not meaningful.
+
+For further information, see
+
+- ["Waste Not, Want Not..."](http://journals.plos.org/ploscompbiol/article?id=10.1371/journal.pcbi.1003531)
+- [Discussion for Issue 229](https://github.com/joey711/phyloseq/issues/229)
+- [Discussion for Issue 299](https://github.com/joey711/phyloseq/issues/299)
+
+## Should I normalize my data before alpha-diversity analysis?
+
+**No.** Generally speaking, the answer is **no**.
+Most alpha diversity methods will be most effective
+when provided with the originally-observed count values.
+
+The misleading notion --
+that normalization is necessary
+prior to alpha-diversity analysis --
+seems to be derived from various
+"one size fits all" pipeline tools like QIIME,
+in which it is often encouraged to
+[*rarefy*](http://journals.plos.org/ploscompbiol/article?id=10.1371/journal.pcbi.1003531)
+counts as a normalizing transformation prior to any/all analysis.
+While this may simplify certain aspects of pipeline software development,
+it is analytical and statistical folly.
+**Rarefying microbiome data is statistically inadmissible**.
+
+For further information, I suggest reviewing literature such as
+
+- [Gotelli & Colwell (2001)](http://onlinelibrary.wiley.com/doi/10.1046/j.1461-0248.2001.00230.x/abstract;jsessionid=A5EF264ABB5EADD5CCE9EF3AEE50CA41.f01t03), and of course,
+- ["Waste Not, Want Not..."](http://journals.plos.org/ploscompbiol/article?id=10.1371/journal.pcbi.1003531)
+
+
+## Negative numbers in my transformed data table?
+
+This sort of question usually appears after someone used
+a log-like transformation / variance stabilizing transformation
+on their data,
+in preparation for an exploratory analysis via ordination.
+Negative values in this context probably correspond
+to **"less than one count"** after rescaling.
+For many ordination methods,
+like [PCA](https://en.wikipedia.org/wiki/Principal_component_analysis),
+negative numbers are not a problem.
+
+Instead, the problem is often posed because a user
+also wants to use **a particular distance measure**
+that is undefined or unstable in the presence of negative entries.
+In this context, however, the more negative a value is,
+the more likely that it was zero, or very small,
+in the original "raw" count matrix.
+For most distances and hypotheses, these values
+are probably not very important, or even negligible.
+Given this, it is probably quite reasonable to do one of the following:
+
+(1) Set to zero all values less than zero.
+If `X` is your matrix, you can accomplish this with
+`X[X < 0.0] <- 0.0`
+(2) Add a pseudocount prior to data transformation.
+This often curbs or prevents the presence of zeroes
+in the table of transformed values.
+Some people don't like this approach for their dataset,
+and they may or may not be correct.
+It is up to you to decide for your data.
+See [Discussion on Issue 445](https://github.com/joey711/phyloseq/issues/445).
+
+Please also note that taxa entries that are all negative after transformation,
+or equivalently are very small or almost always zero,
+should probably be filtered from your data
+prior to analysis.
+There are many different reasons for this.
+
+
+## I get an error regarding geometric mean
+
+See my [SO post on alternative geometric mean functions in R](http://stackoverflow.com/a/25555105/935950).
+There are several examples for alternative calculations of geometric mean,
+and some of these might solve the problem of having an error.
+
+See also the discussion on [Issue 445](https://github.com/joey711/phyloseq/issues/445)
+regarding geometric means.
+
+Alternative library size estimators may be appropriate for your data,
+and it remains your responsibility
+to determine if any specific approach is valid.
+
+Mike Love (a developer for DESeq2), suggested the following consideration:
+
+"On the other hand, very sparse count datasets,
+with large counts for single samples per row and the rest at 0,
+don't fit well to the negative binomial distribution.
+Here, the VST or simply shifted log, `log(count+k)`,
+might be a safer choice than the `rlog`.
+A way that I test for sparsity is looking at a plot
+of the row sum of counts and the proportion of count
+which is in a single sample."
+
+
+## Pseudocounts are not appropriate for my data, because...
+
+See [Discussion on Issue 445](https://github.com/joey711/phyloseq/issues/445).
+
+Also, think carefully about what you mean here.
+I suspect this statement could be more accurately stated as,
+*pseudocounts are not appropriate for my experiment, data, and the analysis step I was about to perform*.
+Your position in this case is thus based on a combination
+of how the data appears to behave,
+and your knowledge of how pseudocounts would affect
+the analysis you were going to use.
+Consider the following.
+
+- Is there an alternative analysis method?
+- Is the method you were about to use really that sensitive to adding a pseudocount?
+- Is a pseudocount really needed, or were you copying/pasting this step
+to an analysis script that you found somewhere?
+
+
+## I’m scared that the Negative Binomial doesn’t fit my data well
+
+See [Discussion on Issue 445](https://github.com/joey711/phyloseq/issues/445).
+
+
+## I don’t know how to test for differential abundance now. How do I do that?
+
+There is now lots of documentation on this topic.
+
+For starters, please see
+[the phyloseq vignette devoted to this topic](http://bioconductor.org/packages/release/bioc/vignettes/phyloseq/inst/doc/phyloseq-mixture-models.html).
+
+A Google search for "phyloseq differential abundance"
+will also likely turn up a number of useful, related resources.
+
+
+# - I need help analyzing my data. It has the following study design...
+
+I am currently a biostatistician at Second Genome, Inc.,
+which offers complete
+[end-to-end microbiome experiment solutions](http://www.secondgenome.com/solutions)
+as a fee-for-service.
+In some cases Second Genome clients already have their microbiome data
+and want to make use of our team of trained microbiome analysts
+to get the most information from their experiment.
+I recommend contacting one of the sales associates at the link above.
+
+My day-to-day efforts are in understanding the role of the microbiome
+in human health and disease.
+If you're looking for a collaboration on your microbiome
+data collection or data analysis,
+please contact [Second Genome Solutions](http://www.secondgenome.com/solutions).
diff --git a/inst/doc/phyloseq-FAQ.html b/inst/doc/phyloseq-FAQ.html
new file mode 100644
index 0000000..5d9895e
--- /dev/null
+++ b/inst/doc/phyloseq-FAQ.html
@@ -0,0 +1,347 @@
+<!DOCTYPE html>
+
+<html xmlns="http://www.w3.org/1999/xhtml">
+
+<head>
+
+<meta charset="utf-8">
+<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
+<meta name="generator" content="pandoc" />
+
+<meta name="author" content="Paul McMurdie and Susan Holmes" />
+
+
+<title>phyloseq Frequently Asked Questions (FAQ)</title>
+
+<link href="data:text/css;charset=utf-8,pre%20%2Eoperator%2C%0Apre%20%2Eparen%20%7B%0Acolor%3A%20rgb%28104%2C%20118%2C%20135%29%0A%7D%0Apre%20%2Eliteral%20%7B%0Acolor%3A%20%23990073%0A%7D%0Apre%20%2Enumber%20%7B%0Acolor%3A%20%23099%3B%0A%7D%0Apre%20%2Ecomment%20%7B%0Acolor%3A%20%23998%3B%0Afont%2Dstyle%3A%20italic%0A%7D%0Apre%20%2Ekeyword%20%7B%0Acolor%3A%20%23900%3B%0Afont%2Dweight%3A%20bold%0A%7D%0Apre%20%2Eidentifier%20%7B%0Acolor%3A%20rgb%280%2C%200%2C%200%29%3B%0A%7D%0Apre%20%2Estri [...]
+<script src="data:application/x-javascript;base64,dmFyIGhsanM9bmV3IGZ1bmN0aW9uKCl7ZnVuY3Rpb24gbShwKXtyZXR1cm4gcC5yZXBsYWNlKC8mL2dtLCImYW1wOyIpLnJlcGxhY2UoLzwvZ20sIiZsdDsiKX1mdW5jdGlvbiBmKHIscSxwKXtyZXR1cm4gUmVnRXhwKHEsIm0iKyhyLmNJPyJpIjoiIikrKHA/ImciOiIiKSl9ZnVuY3Rpb24gYihyKXtmb3IodmFyIHA9MDtwPHIuY2hpbGROb2Rlcy5sZW5ndGg7cCsrKXt2YXIgcT1yLmNoaWxkTm9kZXNbcF07aWYocS5ub2RlTmFtZT09IkNPREUiKXtyZXR1cm4gcX1pZighKHEubm9kZVR5cGU9PTMmJnEubm9kZVZhbHVlLm1hdGNoKC9ccysvKSkpe2JyZWFrfX19ZnVuY3Rpb24gaCh0LH [...]
+
+<style type="text/css">code{white-space: pre;}</style>
+<style type="text/css">
+ pre:not([class]) {
+ background-color: white;
+ }
+</style>
+<script type="text/javascript">
+if (window.hljs && document.readyState && document.readyState === "complete") {
+ window.setTimeout(function() {
+ hljs.initHighlighting();
+ }, 0);
+}
+</script>
+
+
+<link href="data:text/css;charset=utf-8,body%2C%20td%20%7B%0Afont%2Dfamily%3A%20sans%2Dserif%3B%0Abackground%2Dcolor%3A%20white%3B%0Afont%2Dsize%3A%2013px%3B%0A%7D%0Abody%20%7B%0Amax%2Dwidth%3A%20800px%3B%0Amargin%3A%200%20auto%3B%0Apadding%3A%201em%201em%202em%3B%0Aline%2Dheight%3A%2020px%3B%0A%7D%0A%0Adiv%23TOC%20li%20%7B%0Alist%2Dstyle%3Anone%3B%0Abackground%2Dimage%3Anone%3B%0Abackground%2Drepeat%3Anone%3B%0Abackground%2Dposition%3A0%3B%0A%7D%0A%0Ap%2C%20pre%20%7B%20margin%3A%200em%2 [...]
+
+<script type="text/javascript">
+document.addEventListener("DOMContentLoaded", function() {
+ var links = document.links;
+ for (var i = 0, linksLength = links.length; i < linksLength; i++)
+ if(links[i].hostname != window.location.hostname)
+ links[i].target = '_blank';
+});
+</script>
+
+</head>
+
+<body>
+
+
+<div id="header">
+<h1 class="title">phyloseq Frequently Asked Questions (FAQ)</h1>
+<h4 class="author"><em>Paul McMurdie and Susan Holmes</em></h4>
+<h4 class="date"><em>Thu Dec 29 19:03:27 2016</em></h4>
+</div>
+
+<h1>Contents</h1>
+<div id="TOC">
+<ul>
+<li><a href="#i-tried-reading-my-biom-file-using-phyloseq-but-it-didnt-work.-whats-wrong"><span class="toc-section-number">1</span> - I tried reading my biom file using phyloseq, but it didn’t work. What’s wrong?</a><ul>
+<li><a href="#good-news-hdf5-biom-should-be-supported-in-next-release"><span class="toc-section-number">1.1</span> Good News: HDF5-biom should be supported in next release</a></li>
+<li><a href="#hdf5-version-2.0-biom-format-biomformat"><span class="toc-section-number">1.2</span> HDF5 (Version 2.0) biom-format: <em>biomformat</em></a></li>
+<li><a href="#not-every-data-component-is-included-in-.biom-files"><span class="toc-section-number">1.3</span> Not every data component is included in .biom files</a></li>
+<li><a href="#other-issues-related-the-biom-format"><span class="toc-section-number">1.4</span> Other issues related to the biom-format</a></li>
+</ul></li>
+<li><a href="#microbio_me_qiime-returned-an-error.-whats-wrong"><span class="toc-section-number">2</span> - <code>microbio_me_qiime()</code> returned an error. What’s wrong?</a><ul>
+<li><a href="#the-qiime-db-server-is-permanently-down."><span class="toc-section-number">2.1</span> The QIIME-DB Server is Permanently Down.</a></li>
+<li><a href="#an-interface-to-qiita-is-planned."><span class="toc-section-number">2.2</span> An interface to Qiita is Planned.</a></li>
+</ul></li>
+<li><a href="#i-want-a-phyloseq-graphic-that-looks-like"><span class="toc-section-number">3</span> - I want a phyloseq graphic that looks like…</a><ul>
+<li><a href="#modify-the-ggplot-object-yourself."><span class="toc-section-number">3.1</span> Modify the ggplot object yourself.</a></li>
+<li><a href="#psmelt-and-ggplot2"><span class="toc-section-number">3.2</span> psmelt and ggplot2</a></li>
+<li><a href="#submit-a-pull-request-advanced"><span class="toc-section-number">3.3</span> Submit a Pull Request (Advanced)</a></li>
+<li><a href="#define-a-ggplot2-extension-advanced"><span class="toc-section-number">3.4</span> Define a ggplot2 extension (Advanced)</a></li>
+</ul></li>
+<li><a href="#theres-a-typo-in-phyloseq-documentation-tutorials-or-vignettes"><span class="toc-section-number">4</span> - There’s a typo in phyloseq documentation, tutorials, or vignettes</a><ul>
+<li><a href="#fix-the-typo-directly-on-github"><span class="toc-section-number">4.1</span> Fix the typo directly on GitHub</a></li>
+<li><a href="#minimal-git-and-github-exercise"><span class="toc-section-number">4.2</span> Minimal GIT and GitHub Exercise</a></li>
+</ul></li>
+<li><a href="#i-read-waste-not-want-not-but"><span class="toc-section-number">5</span> - I read <a href="http://journals.plos.org/ploscompbiol/article?id=10.1371/journal.pcbi.1003531">“Waste Not, Want Not…”</a> but…</a><ul>
+<li><a href="#i-tried-to-use-deseq2-to-normalize-my-data-but-now-i-dont-know-what-to-do"><span class="toc-section-number">5.1</span> I tried to <a href="http://joey711.github.io/phyloseq-extensions/DESeq2.html">use DESeq2</a> to normalize my data, but now I don’t know what to do…</a></li>
+<li><a href="#my-librariessamples-had-different-total-number-of-reads-what-do-i-do"><span class="toc-section-number">5.2</span> My libraries/samples had different total number of reads, what do I do?</a></li>
+<li><a href="#should-i-normalize-my-data-before-alpha-diversity-analysis"><span class="toc-section-number">5.3</span> Should I normalize my data before alpha-diversity analysis</a></li>
+<li><a href="#negative-numbers-in-my-transformed-data-table"><span class="toc-section-number">5.4</span> Negative numbers in my transformed data table?</a></li>
+<li><a href="#i-get-an-error-regarding-geometric-mean"><span class="toc-section-number">5.5</span> I get an error regarding geometric mean</a></li>
+<li><a href="#pseudocounts-are-not-appropriate-for-my-data-because"><span class="toc-section-number">5.6</span> Pseudocounts are not appropriate for my data, because…</a></li>
+<li><a href="#im-scared-that-the-negative-binomial-doesnt-fit-my-data-well"><span class="toc-section-number">5.7</span> I’m scared that the Negative Binomial doesn’t fit my data well</a></li>
+<li><a href="#i-dont-know-how-to-test-for-differential-abundance-now.-how-do-i-do-that"><span class="toc-section-number">5.8</span> I don’t know how to test for differential abundance now. How do I do that?</a></li>
+</ul></li>
+<li><a href="#i-need-help-analyzing-my-data.-it-has-the-following-study-design"><span class="toc-section-number">6</span> - I need help analyzing my data. It has the following study design…</a></li>
+</ul>
+</div>
+
+<p>This vignette includes answers and supporting materials that address <a href="https://en.wikipedia.org/wiki/FAQ">frequently asked questions (FAQs)</a>, especially those posted on <a href="https://github.com/joey711/phyloseq/issues">the phyloseq issues tracker</a>.</p>
+<p>For most issues <a href="https://github.com/joey711/phyloseq/issues">the phyloseq issues tracker</a> should suffice; but occasionally there are questions that are asked repeatedly enough that it becomes appropriate to canonize the answer here in this vignette. This is both (1) to help users find solutions more quickly, and (2) to mitigate redundancy on <a href="https://github.com/joey711/phyloseq/issues">the issues tracker</a>.</p>
+<p>All users are encouraged to perform a google search and review other questions/responses to both open and closed issues on <a href="https://github.com/joey711/phyloseq/issues">the phyloseq issues tracker</a> before seeking an active response by posting a new issue.</p>
+<pre class="r"><code>library("phyloseq"); packageVersion("phyloseq")</code></pre>
+<pre><code>## [1] '1.19.1'</code></pre>
+<pre class="r"><code>library("ggplot2"); packageVersion("ggplot2")</code></pre>
+<pre><code>## [1] '2.2.0'</code></pre>
+<pre class="r"><code>theme_set(theme_bw())</code></pre>
+<div id="i-tried-reading-my-biom-file-using-phyloseq-but-it-didnt-work.-whats-wrong" class="section level1">
+<h1><span class="header-section-number">1</span> - I tried reading my biom file using phyloseq, but it didn’t work. What’s wrong?</h1>
+<p>The most common cause of this error is a massive change to the way biom files are stored on disk. There are currently two “versions” of the biom-format, each of which stores data very differently. The original format – and original support in phyloseq – was for biom-format version 1 based on <a href="https://en.wikipedia.org/wiki/JSON">JSON</a>.</p>
+<p>The latest version – version 2 – is based on the <a href="https://www.hdfgroup.org/HDF5/doc/UG/index.html">HDF5</a> file format, and this new biom format version recently became the default file output format for popular workflows like QIIME.</p>
+<div id="good-news-hdf5-biom-should-be-supported-in-next-release" class="section level2">
+<h2><span class="header-section-number">1.1</span> Good News: HDF5-biom should be supported in next release</h2>
+<p>The <em>biomformat</em> package is the Bioconductor incarnation of R package support for the biom file format, written by Paul McMurdie (phyloseq author) and Joseph Paulson (metagenomeSeq author). Although it has been available on GitHub and BioC-devel for many months now, the first release version of <em>biomformat</em> on Bioconductor will be in April 2016. In that same release, phyloseq will switch over from the JSON-only <em>biom</em> package hosted on CRAN to this new package, <e [...]
+<p>This difference will be largely opaque to users, and phyloseq will “just work” after the next release in April.</p>
+<p>Use the <code>import_biom</code> function to read your recent QIIME or other biom-format data.</p>
+<p>Additional background details are described in <a href="https://github.com/joey711/phyloseq/issues/443">Issue 443</a>.</p>
+</div>
+<div id="hdf5-version-2.0-biom-format-biomformat" class="section level2">
+<h2><span class="header-section-number">1.2</span> HDF5 (Version 2.0) biom-format: <em>biomformat</em></h2>
+<p>As just described, HDF5 biom format is currently supported in the development version of phyloseq, via the new beta/development package called <em>biomformat</em> on BioC-devel and GitHub:</p>
+<p><a href="https://github.com/joey711/biomformat" class="uri">https://github.com/joey711/biomformat</a></p>
+<p>If you need to use HDF5-based biom format files <strong>immediately</strong> and cannot wait for the upcoming release, then you should install the development version of the <em>biomformat</em> package by following the instructions at the link above.</p>
+</div>
+<div id="not-every-data-component-is-included-in-.biom-files" class="section level2">
+<h2><span class="header-section-number">1.3</span> Not every data component is included in .biom files</h2>
+<p>Even though the biom-format supports the self-annotated inclusion of major components like the taxonomy table and sample data table, many tools that generate biom-format files (like QIIME, MG-RAST, mothur, etc.) do not export this data, even if you provided the information in your data input files. The reason for this boggles me, and I’ve shared my views on this with QIIME developers, but there nevertheless seems to be no plan to include your sample data in the output biom file.</p>
+<p>Furthermore, even though I have proposed it to the biom-format team, there is currently no support (or timeline for support) for inclusion of a phylogenetic tree within a “.biom” file.</p>
+<p>A number of tutorials are available demonstrating how one can add components to a phyloseq object after it has been created/imported. The following tutorial is especially relevant</p>
+<p><a href="http://joey711.github.io/phyloseq-demo/import-biom-sd-example.html" class="uri">http://joey711.github.io/phyloseq-demo/import-biom-sd-example.html</a></p>
+<p>Which makes use of the following functions:</p>
+<ul>
+<li><code>import_qiime_sample_data</code></li>
+<li><code>merge_phyloseq</code></li>
+</ul>
+</div>
+<div id="other-issues-related-the-biom-format" class="section level2">
+<h2><span class="header-section-number">1.4</span> Other issues related to the biom-format</h2>
+<p>There are a number of different Issue Tracker posts discussing this format with respect to phyloseq:</p>
+<p><a href="https://github.com/joey711/phyloseq/issues/302" class="uri">https://github.com/joey711/phyloseq/issues/302</a></p>
+<p><a href="https://github.com/joey711/phyloseq/issues/272" class="uri">https://github.com/joey711/phyloseq/issues/272</a></p>
+<p><a href="https://github.com/joey711/phyloseq/issues/392" class="uri">https://github.com/joey711/phyloseq/issues/392</a></p>
+<p><a href="https://github.com/joey711/phyloseq/issues/443">Issue 443</a> has details for updated format.</p>
+</div>
+</div>
+<div id="microbio_me_qiime-returned-an-error.-whats-wrong" class="section level1">
+<h1><span class="header-section-number">2</span> - <code>microbio_me_qiime()</code> returned an error. What’s wrong?</h1>
+<div id="the-qiime-db-server-is-permanently-down." class="section level2">
+<h2><span class="header-section-number">2.1</span> The QIIME-DB Server is Permanently Down.</h2>
+<p>The QIIME-DB server is permanently down.</p>
+<p>Users are suggested to migrate their queries over to Qiita.</p>
+<p>Indeed, the previous link to <a href="http://www.microbio.me/qiime/index.psp">microbio.me/qiime</a> now sends users to the new Qiita website.</p>
+</div>
+<div id="an-interface-to-qiita-is-planned." class="section level2">
+<h2><span class="header-section-number">2.2</span> An interface to Qiita is Planned.</h2>
+<p>Stay tuned. The Qiita API needs to be released by the Qiita developers first. The phyloseq developers have no control over this, as we are not affiliated directly with the QIIME developers. Once there is an official Qiita API with documentation, an interface for phyloseq will be added.</p>
+<p>We found the <code>microbio_me_qiime()</code> function to be very convenient while the QIIME-DB server lasted. Hopefully an equivalent is hosted soon.</p>
+</div>
+</div>
+<div id="i-want-a-phyloseq-graphic-that-looks-like" class="section level1">
+<h1><span class="header-section-number">3</span> - I want a phyloseq graphic that looks like…</h1>
+<p>Great!</p>
+<p><strong>Every plot function in phyloseq returns a ggplot2 object</strong>. When these objects are “printed” to standard output in an R session, for instance,</p>
+<pre class="r"><code>data(esophagus)
+plot_tree(esophagus)</code></pre>
+<p><img src="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAA8AAAAKgCAMAAABji+RHAAAC/VBMVEUAAAABAQECAgIDAwMEBAQFBQUHBwcICAgJCQkKCgoLCwsMDAwNDQ0ODg4PDw8QEBARERESEhITExMUFBQVFRUWFhYXFxcYGBgZGRkaGhobGxscHBwdHR0eHh4fHx8gICAhISEiIiIjIyMkJCQlJSUmJiYnJycoKCgpKSkqKiorKyssLCwtLS0uLi4vLy8wMDAxMTEyMjIzMzM0NDQ1NTU2NjY3Nzc4ODg5OTk6Ojo7Ozs8PDw9PT0+Pj4/Pz9AQEBBQUFCQkJDQ0NERERFRUVGRkZHR0dISEhJSUlKSkpLS0tMTExNTU1OTk5PT09QUFBRUVFSUlJTU1NUVFRVVVVWVlZXV1dYWFhZWVlaWlpbW1tcXFxdXV1eXl5fX19gYGBhYWFiYmJjY2NkZGRlZ [...]
+<p>then the graphic is rendered in <a href="https://stat.ethz.ch/R-manual/R-devel/library/grDevices/html/Devices.html">the current graphic device</a>.</p>
+<p>Alternatively, if you save the object output from a phyloseq <code>plot_</code> function as a variable in your session, then you can further modify it, interactively, at your leisure. For instance,</p>
+<pre class="r"><code>p1 = plot_tree(esophagus, color = "Sample")
+p1</code></pre>
+<p><img src="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAA8AAAAKgCAIAAADbN4MiAAAACXBIWXMAAA7DAAAOwwHHb6hkAAAgAElEQVR4nOzdd2AUdf7/8fdsSe8dAggJhNACCKE3IdgQCweoKKiHiqKnchY4Fb/YsctPhAM5FeUsIIoKKkVEqUpvoXdCIAkEUknZnd8f4UKyu0lY2N3Z7D4ff82857OT17cILyazM4qqqgIAAADg0ui0DgAAAADUJxRoAAAAwA4UaAAAAMAOFGgAAADADhRoAAAAwA4UaAAAAMAOhtoPl5aWvvDCC66JAgAAAPc3bty42NhYrVNoqY4CbTKZRGTy5MkuCQMAAAD3tWHDhoceeujBBx/UOojGuIUDAAAAsAMFGgAAALADBRoAAACwAwUaAAAAsAMFGgAAALADBRoAAACwAwUaAAAAsAMFGgAAALADBRoAAACwAwUaAAAAs [...]
+<pre class="r"><code>p1 +
+ ggtitle("This is my title.") +
+ annotate("text", 0.25, 3,
+ color = "orange",
+ label = "my annotation")</code></pre>
+<p><img src="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAA8AAAAKgCAIAAADbN4MiAAAACXBIWXMAAA7DAAAOwwHHb6hkAAAgAElEQVR4nOzdd3xT9f7H8c9Jmu5FJx1AF6VsKqXsXRFFEAFFRcFxr4go1yt4f15xoVcFFSfgdYPeex2ICgiOKkMQAdl7FgqU0tIFXXQk+f3RWto0LQSSnDR9Pf/Kd+T0fe9D9E16cr6K0WgUAAAAAJdHo3YAAAAAoCmhQAMAAAAWoEADAAAAFqBAAwAAABagQAMAAAAWoEADAAAAFrh0gb777ruVhr3xxhsi0q9fv9GjRzd0BX9//8jISEuTXdm77KORbElJSbfddpud8wAAAMBuXC65Y+TIkTVlcevWrT/88MNdd93VunXrqpnk5ORLXiEgIMDLy8vSZFf2LhtZs2bNhg0bnnjiiaph7WwmSwAAAHBuikUHqfz73/+eM [...]
+<p>There are lots of ways for you to generate custom graphics with phyloseq as a starting point.</p>
+<p>The following sections list some of my favorites.</p>
+<div id="modify-the-ggplot-object-yourself." class="section level2">
+<h2><span class="header-section-number">3.1</span> Modify the ggplot object yourself.</h2>
+<p>For example, <a href="http://joey711.github.io/phyloseq/plot_ordination-examples.html">the plot_ordination() examples tutorial</a> provides several examples of using additional ggplot2 commands to modify/customize the graphic encoded in the ggplot2 object returned by <code>plot_ordination</code>.</p>
+<p><a href="http://docs.ggplot2.org/current/">The ggplot2 documentation</a> is the current and canonical online reference for creating, modifying, and developing with ggplot2 objects.</p>
+<p>For simple changes to aesthetics and aesthetic mapping, <a href="http://docs.ggplot2.org/current/vignettes/ggplot2-specs.html">the aesthetic specifications vignette</a> is a useful resource.</p>
+</div>
+<div id="psmelt-and-ggplot2" class="section level2">
+<h2><span class="header-section-number">3.2</span> psmelt and ggplot2</h2>
+<p>The <code>psmelt</code> function converts your phyloseq object into a table (<code>data.frame</code>) that is very friendly for defining a custom ggplot2 graphic. This function was originally created as an internal (not user-exposed) tool within phyloseq to enable a <a href="https://en.wikipedia.org/wiki/Don%27t_repeat_yourself">DRY</a> approach to building ggplot2 graphics from microbiome data represented as phyloseq objects.</p>
+<p>When applicable, the phyloseq <code>plot_</code> family of functions use <code>psmelt</code>. This function is now a documented and user-accessible function in phyloseq – for the main purpose of enabling users to create their own ggplot2 graphics as needed.</p>
+<p>There are lots of great documentation examples for ggplot2 at</p>
+<ul>
+<li><a href="http://docs.ggplot2.org/current/">the ggplot2 official documentation site</a>,</li>
+<li><a href="http://stackoverflow.com/tags/ggplot2">ggplot2 on StackOverflow</a>, and</li>
+<li><a href="https://joey711.github.io/phyloseq/">phyloseq documentation pages</a>.</li>
+</ul>
+<p>The following are two very simple examples of using <code>psmelt</code> to define your own ggplot2 object “from scratch”. It should be evident that you could include further ggplot2 commands to modify each plot further, as you see fit.</p>
+<pre class="r"><code>data("esophagus")
+mdf = psmelt(esophagus)
+# Simple bar plot. See plot_bar() for more.
+ggplot(mdf, aes(x = Sample,
+ y = Abundance)) +
+ geom_bar(stat = "identity", position = "stack", color = "black")</code></pre>
+<p><img src="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAA8AAAAKgCAMAAABji+RHAAAC91BMVEUAAAABAQECAgIDAwMEBAQFBQUGBgYHBwcICAgJCQkKCgoLCwsMDAwNDQ0ODg4PDw8QEBARERESEhITExMUFBQVFRUWFhYXFxcYGBgZGRkaGhobGxscHBwdHR0eHh4fHx8gICAhISEiIiIjIyMkJCQlJSUmJiYnJycoKCgpKSkqKiorKyssLCwtLS0uLi4vLy8wMDAxMTEyMjIzMzM0NDQ1NTU2NjY3Nzc4ODg5OTk6Ojo7Ozs8PDw9PT0+Pj4/Pz9AQEBBQUFCQkJDQ0NERERFRUVGRkZHR0dISEhJSUlKSkpLS0tMTExNTU1OTk5PT09QUFBRUVFSUlJTU1NUVFRVVVVWVlZXV1dYWFhZWVlaWlpbW1tcXFxdXV1eXl5fX19gYGBhYWFiYmJjY2NkZ [...]
+<pre class="r"><code># Simple heat map. See plot_heatmap() for more.
+ggplot(mdf, aes(x = Sample,
+ y = OTU,
+ fill = Abundance)) +
+ geom_raster()</code></pre>
+<p><img src="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAA8AAAAKgCAIAAADbN4MiAAAACXBIWXMAAA7DAAAOwwHHb6hkAAAgAElEQVR4nOzde1hU173/8TUz3BUogjVHQEUo0WqlJtWGVI6aoxxzabSg9NE0UEVTJtFOjU8MxdonJqZ4IxMiGn4NBlEjRgNeYkIM1nhJS08i0VSOpsRLNFLRYqOiAsMM+/fHjlOOzowDe88MDO/Xwx971qy99l40j/26XXt9NJIkCQAAAADO0Xr6BgAAAIDuhAIaAAAA6AAKaAAAAKADKKABAACADqCABgAAADqAAhoAAADoAB9P34AHvPbaa//4xz88fRdwSltbW1tbm49PT/wPFd6H/57hTSRJslgsOp1Oo9F4+l7grPnz5/fr18/Td+ENeuKf46dOncrPz1c+zorXN537x0Xl48ABi8ViNpv9ff09fSPeT7KYPX0L3 [...]
+</div>
+<div id="submit-a-pull-request-advanced" class="section level2">
+<h2><span class="header-section-number">3.3</span> Submit a Pull Request (Advanced)</h2>
+<p>If your new custom plot function is awesome and you think others might use it, add it to the <code>"plot-methods.R"</code> source file and submit a pull request on GitHub.</p>
+<p><a href="https://help.github.com/articles/using-pull-requests/">GitHub Official Pull Request Documentation</a></p>
+<p>Please include example and test code in the code included in your pull request.</p>
+<p>I’ll try and add it to the package by the next release. I will also give you authorship credit in the function doc. See the “typo fix” section below for further details about GitHub pull requests…</p>
+</div>
+<div id="define-a-ggplot2-extension-advanced" class="section level2">
+<h2><span class="header-section-number">3.4</span> Define a ggplot2 extension (Advanced)</h2>
+<p>Development of new R functions/commands for creating/modifying new geometric objects is now formally documented in <a href="http://docs.ggplot2.org/current/vignettes/extending-ggplot2.html">the ggplot2 extension vignette</a>.</p>
+<p>This may be related to the previous section, in that your ggplot2 extension for phyloseq could be contributed to the phyloseq project as a pull request.</p>
+</div>
+</div>
+<div id="theres-a-typo-in-phyloseq-documentation-tutorials-or-vignettes" class="section level1">
+<h1><span class="header-section-number">4</span> - There’s a typo in phyloseq documentation, tutorials, or vignettes</h1>
+<p>This is something that is actually faster and less work for you to solve yourself and contribute back to the phyloseq package. For trivial typo fixes, I will quickly include your fixes into the package code. Sometimes I accept them on my cell phone while I’m still in bed. No wasted time on either end! :-)</p>
+<p>The point is that this should be simple, and is simple if you follow one of the following suggestions.</p>
+<div id="fix-the-typo-directly-on-github" class="section level2">
+<h2><span class="header-section-number">4.1</span> Fix the typo directly on GitHub</h2>
+<p>GitHub now provides the option to make changes to code/text of a repository directly from your web browser through an in-page editor. This handles all the Git details for you. If you have a GitHub account and you’re logged in, all you’d have to do is locate the file with the offending typo, then use the “edit” button to make the changes and send them to me as a pull request.</p>
+</div>
+<div id="minimal-git-and-github-exercise" class="section level2">
+<h2><span class="header-section-number">4.2</span> Minimal GIT and GitHub Exercise</h2>
+<p><img src="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAccAAAEaCAIAAADrPgsUAAAAA3NCSVQICAjb4U/gAAAgAElEQVR4Xu2dXehdxfX3jb+fEbTR2jdqrC/B3rXxhQfKU5KgrVCllqr/pxVsL6LQi5pSonhjrjRX9kaMBRMoiEqphdYSpZVqKaiEFkpbooaivRBNqzEFaXwXg7HPx3z5T4eZvWfP7Jdz9jl7nYvDPrPXrFnznTXfveZlzznhP02fN998s0nkv/eLhF955ZWBNBeZUSRsNvtNVgRdkbDhbDjXkcP4fePEE+xjCBgChoAh0B8Cxqr9YWmaDAFDwBA44QRjVfMCQ8AQMAT6RGC1T2WddT311FPoePrpp19//XUunnzySafy2LFjKysrmSUMJ3z06NG1a9fO3YyiCs7G5ksvvRRYPv7xj1900UVcXHLJJZkomZghsGQIrL711lvpKjUK+ [...]
+<p>(The following instructions are borrowed from <a href="http://yihui.name/en/2013/06/fix-typo-in-documentation/">Yihui Xie’s site about fixing typos</a>)</p>
+<p>Alternatively, for those who want to try GIT and Github pull requests, which make it possible for you to contribute to open source and fix obvious problems with no questions being asked – just do it yourself, and send the changes to the original author(s) through Github.</p>
+<p>The official documentation for Github pull requests is a little bit verbose for beginners. Basically what you need to do for simple tasks are:</p>
+<ol style="list-style-type: decimal">
+<li>click the Fork button and clone the repository in your own account;</li>
+<li>make the changes in your cloned version;</li>
+<li>push to your repository;</li>
+<li>click the Pull Request button to send a request to the original author;</li>
+</ol>
+</div>
+</div>
+<div id="i-read-waste-not-want-not-but" class="section level1">
+<h1><span class="header-section-number">5</span> - I read <a href="http://journals.plos.org/ploscompbiol/article?id=10.1371/journal.pcbi.1003531">“Waste Not, Want Not…”</a> but…</h1>
+<p>Before getting to more specific issues, let’s start by keeping appropriately separate the concepts of</p>
+<ul>
+<li><ol style="list-style-type: decimal">
+<li>denoising amplicon sequences, and/or denoising features in the contingency table, and</li>
+</ol></li>
+<li><ol start="2" style="list-style-type: decimal">
+<li>standardization</li>
+</ol></li>
+</ul>
+<p>These two concepts have been often-conflated – mostly by purveyors of methods that use rarefying – wrongly insisting that rarefying is somehow addressing both problems and the matter is settled. Unfortunately rarefying is a very inefficient, noise-introducing method that poorly addresses the data analysis challenges that motivate either concept.</p>
+<p>DESeq2 and related solutions can help you address the need for standardization (e.g. differing library sizes) at a particular step in your analysis while still making efficient inferences from your data.</p>
+<p>The denoising problem is best addressed at the sequence-processing level, and the best general-purpose option currently available is:</p>
+<ul>
+<li><a href="http://benjjneb.github.io/dada2/">The dada2 algorithm</a>, if your data works well with it. Current support is mainly Illumina sequence data, or</li>
+<li><a href="http://drive5.com/uparse/">UPARSE</a> in the usearch package, if you don’t have sequencing data that works well with <a href="http://benjjneb.github.io/dada2/">dada2</a></li>
+</ul>
+<div id="i-tried-to-use-deseq2-to-normalize-my-data-but-now-i-dont-know-what-to-do" class="section level2">
+<h2><span class="header-section-number">5.1</span> I tried to <a href="http://joey711.github.io/phyloseq-extensions/DESeq2.html">use DESeq2</a> to normalize my data, but now I don’t know what to do…</h2>
+<p>The answer to a question of this category depends a lot on your experiment, and what you want to learn from your data. The following are some resources that may help.</p>
+<ul>
+<li><a href="http://joey711.github.io/waste-not-supplemental/">Waste Not, Want Not Supplemental Materials</a></li>
+<li><a href="https://www.bioconductor.org/packages/release/bioc/vignettes/phyloseq/inst/doc/phyloseq-mixture-models.html">Differential Abundance Vignette</a></li>
+<li><a href="https://joey711.github.io/phyloseq/">The phyloseq front page</a></li>
+</ul>
+</div>
+<div id="my-librariessamples-had-different-total-number-of-reads-what-do-i-do" class="section level2">
+<h2><span class="header-section-number">5.2</span> My libraries/samples had different total number of reads, what do I do?</h2>
+<p>That is an expected artifact of current sequencing technologies, and not a “problem” on its own. In most cases, differences in total counts are uncorrelated with any variable in your experimental design. <strong>You should check that this is the case</strong>. It remains possible that there are structural/procedural artifacts in your experiment that have influenced the total counts. If library sizes are correlated with one of your design variables, then this <em>might</em> represent a [...]
+<p>Other than the portent of structural biases in your experiment, you should recall that comparing observation classes that have <strong>uneven sample sizes is neither a new nor an unsolved problem in statistics</strong>.</p>
+<p>The most useful analytical methods you can use in this context are therefore methods that expect and account for differences in total number of reads between samples.</p>
+<p>How you account for these <em>library size</em> differences should depend on the type of analysis in which you are engaged, and which methods you plan to use. For instance, for a beta-diversity measure like Bray-Curtis Dissimilarity, you might simply use the relative abundance of each taxon in each sample, as the absolute counts are not appropriate to use directly in the context where count differences are not meaningful.</p>
+<p>For further information, see</p>
+<ul>
+<li><a href="http://journals.plos.org/ploscompbiol/article?id=10.1371/journal.pcbi.1003531">“Waste Not, Want Not…”</a></li>
+<li><a href="https://github.com/joey711/phyloseq/issues/229">Discussion for Issue 229</a></li>
+<li><a href="https://github.com/joey711/phyloseq/issues/299">Discussion for Issue 299</a></li>
+</ul>
+</div>
+<div id="should-i-normalize-my-data-before-alpha-diversity-analysis" class="section level2">
+<h2><span class="header-section-number">5.3</span> Should I normalize my data before alpha-diversity analysis</h2>
+<p><strong>No.</strong> Generally speaking, the answer is <strong>no</strong>. Most alpha diversity methods will be most effective when provided with the originally-observed count values.</p>
+<p>The misleading notion – that normalization is necessary prior to alpha-diversity analysis – seems to be derived from various “one size fits all” pipeline tools like QIIME, in which it is often encouraged to <a href="http://journals.plos.org/ploscompbiol/article?id=10.1371/journal.pcbi.1003531"><em>rarefy</em></a> counts as a normalizing transformation prior to any/all analysis. While this may simplify certain aspects of pipeline software development, it is analytical and statistical f [...]
+<p>For further information, I suggest reviewing literature such as</p>
+<ul>
+<li><a href="http://onlinelibrary.wiley.com/doi/10.1046/j.1461-0248.2001.00230.x/abstract;jsessionid=A5EF264ABB5EADD5CCE9EF3AEE50CA41.f01t03">Gotelli Colwell (2001)</a>, and of course,</li>
+<li><a href="http://journals.plos.org/ploscompbiol/article?id=10.1371/journal.pcbi.1003531">“Waste Not, Want Not…”</a></li>
+</ul>
+</div>
+<div id="negative-numbers-in-my-transformed-data-table" class="section level2">
+<h2><span class="header-section-number">5.4</span> Negative numbers in my transformed data table?</h2>
+<p>This sort of question usually appears after someone used a log-like transformation / variance stabilizing transformation on their data, in preparation for an exploratory analysis via ordination. Negative values in this context probably correspond to <strong>“less than one count”</strong> after rescaling. For many ordination methods, like <a href="https://en.wikipedia.org/wiki/Principal_component_analysis">PCA</a>, negative numbers are not a problem.</p>
+<p>Instead, the problem is often posed because a user also wants to use <strong>a particular distance measure</strong> that is undefined or unstable in the presence of negative entries. In this context, however, the more negative a value is, the more likely that it was zero, or very small, in the original “raw” count matrix. For most distances and hypotheses, these values are probably not very important, or even negligible. Given this, it is probably quite reasonable to do one of the fol [...]
+<ol style="list-style-type: decimal">
+<li>Set to zero all values less than zero. If <code>X</code> is your matrix, you can accomplish this with <code>X[X &lt; 0.0] &lt;- 0.0</code></li>
+<li>Add a pseudocount prior to data transformation. This often curbs or prevents the presence of zeroes in the table of transformed values. Some people don’t like this approach for their dataset, and they may or may not be correct. It is up to you to decide for your data. See <a href="https://github.com/joey711/phyloseq/issues/445">Discussion on Issue 445</a>.</li>
+</ol>
+<p>Please also note that taxa entries that are all negative after transformation, or equivalently are very small or almost always zero, should probably be filtered from your data prior to analysis. There are many different reasons for this.</p>
+</div>
+<div id="i-get-an-error-regarding-geometric-mean" class="section level2">
+<h2><span class="header-section-number">5.5</span> I get an error regarding geometric mean</h2>
+<p>See my <a href="http://stackoverflow.com/a/25555105/935950">SO post on alternative geometric mean functions in R</a>. There are several examples of alternative calculations of the geometric mean, and some of these might resolve the error.</p>
+<p>See also the discussion on <a href="https://github.com/joey711/phyloseq/issues/445">Issue 445</a> regarding geometric means.</p>
+<p>Alternative library size estimators may be appropriate for your data, and it remains your responsibility to determine if any specific approach is valid.</p>
+<p>Mike Love (a developer for DESeq2), suggested the following consideration:</p>
+<p>“On the other hand, very sparse count datasets, with large counts for single samples per row and the rest at 0, don’t fit well to the negative binomial distribution. Here, the VST or simply shifted log, <code>log(count+k)</code>, might be a safer choice than the <code>rlog</code>. A way that I test for sparsity is looking at a plot of the row sum of counts and the proportion of count which is in a single sample.”</p>
+</div>
+<div id="pseudocounts-are-not-appropriate-for-my-data-because" class="section level2">
+<h2><span class="header-section-number">5.6</span> Pseudocounts are not appropriate for my data, because…</h2>
+<p>See <a href="https://github.com/joey711/phyloseq/issues/445">Discussion on Issue 445</a>.</p>
+<p>Also, think carefully about what you mean here. I suspect this statement could be more accurately stated as, <em>pseudocounts are not appropriate for my experiment, data, and the analysis step I was about to perform</em>. Your position in this case is thus based on a combination of how the data appears to behave, and your knowledge of how pseudocounts would affect the analysis you were going to use. Consider the following.</p>
+<ul>
+<li>Is there an alternative analysis method?</li>
+<li>Is the method you were about to use really that sensitive to adding a pseudocount?</li>
+<li>Is a pseudocount really needed, or were you copying/pasting this step to an analysis script that you found somewhere?</li>
+</ul>
+</div>
+<div id="im-scared-that-the-negative-binomial-doesnt-fit-my-data-well" class="section level2">
+<h2><span class="header-section-number">5.7</span> I’m scared that the Negative Binomial doesn’t fit my data well</h2>
+<p>See <a href="https://github.com/joey711/phyloseq/issues/445">Discussion on Issue 445</a>.</p>
+</div>
+<div id="i-dont-know-how-to-test-for-differential-abundance-now.-how-do-i-do-that" class="section level2">
+<h2><span class="header-section-number">5.8</span> I don’t know how to test for differential abundance now. How do I do that?</h2>
+<p>There is now lots of documentation on this topic.</p>
+<p>For starters, please see <a href="http://bioconductor.org/packages/release/bioc/vignettes/phyloseq/inst/doc/phyloseq-mixture-models.html">the phyloseq vignette devoted to this topic</a>.</p>
+<p>A Google search for “phyloseq differential abundance” will also likely turn up a number of useful, related resources.</p>
+</div>
+</div>
+<div id="i-need-help-analyzing-my-data.-it-has-the-following-study-design" class="section level1">
+<h1><span class="header-section-number">6</span> - I need help analyzing my data. It has the following study design…</h1>
+<p>I am currently a biostatistician at Second Genome, Inc., which offers complete <a href="http://www.secondgenome.com/solutions">end-to-end microbiome experiment solutions</a> as a fee-for-service. In some cases Second Genome clients already have their microbiome data and want to make use of our team of trained microbiome analysts to get the most information from their experiment. I recommend contacting one of the sales associates at the link above.</p>
+<p>My day-to-day efforts are in understanding the role of the microbiome in human health and disease. If you’re looking for a collaboration on your microbiome data collection or data analysis, please contact <a href="http://www.secondgenome.com/solutions">Second Genome Solutions</a>.</p>
+</div>
+
+
+
+<!-- dynamically load mathjax for compatibility with self-contained -->
+<script>
+ (function () {
+ var script = document.createElement("script");
+ script.type = "text/javascript";
+ script.src = "https://cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML";
+ document.getElementsByTagName("head")[0].appendChild(script);
+ })();
+</script>
+
+</body>
+</html>
diff --git a/inst/doc/phyloseq-analysis.R b/inst/doc/phyloseq-analysis.R
new file mode 100644
index 0000000..3398a38
--- /dev/null
+++ b/inst/doc/phyloseq-analysis.R
@@ -0,0 +1,208 @@
+## ----dontrun-basics-vignette, eval=FALSE---------------------------------
+# vignette("phyloseq-basics")
+
+## ----load-packages, message=FALSE, warning=FALSE-------------------------
+library("phyloseq")
+library("ggplot2")  # attached so the plots below can be extended with ggplot2 layers
+
+## ----ggplot2-themes------------------------------------------------------
+theme_set(theme_bw())  # make theme_bw() the default theme for the ggplots drawn below
+
+## ------------------------------------------------------------------------
+data(GlobalPatterns)  # example dataset used throughout this script
+
+## ------------------------------------------------------------------------
+# Keep only the OTUs whose taxa_sums() value is greater than zero
+GP <- prune_taxa(taxa_sums(GlobalPatterns) > 0, GlobalPatterns)
+# Logical vector: TRUE where SampleType is one of the human-associated categories
+human <- get_variable(GP, "SampleType") %in% c("Feces", "Mock", "Skin", "Tongue")
+# Store that vector as a factor column named `human` in the sample data of GP
+sample_data(GP)$human <- factor(human)
+
+## ----richness_estimates0, fig.width=13, fig.height=7---------------------
+alpha_meas <- c("Observed", "Chao1", "ACE", "Shannon", "Simpson", "InvSimpson")  # measures passed to plot_richness()
+(p <- plot_richness(GP, "human", "SampleType", measures=alpha_meas))  # outer parentheses print the plot as well as assigning it
+
+## ----richness_estimates, fig.width=13, fig.height=7----------------------
+p + geom_boxplot(data=p$data, aes(x=human, y=value, color=NULL), alpha=0.1)  # overlay a boxplot per `human` group on the stored plot
+
+## ------------------------------------------------------------------------
+GP.chl <- subset_taxa(GP, Phylum=="Chlamydiae")  # keep only taxa whose Phylum is Chlamydiae
+
+## ----GP-chl-tree, fig.width=15, fig.height=7, message=FALSE, warning=FALSE----
+plot_tree(GP.chl, color="SampleType", shape="Family", label.tips="Genus", size="Abundance")
+
+## ------------------------------------------------------------------------
+data(enterotype)
+
+## ----EntAbundPlot, fig.height=6, fig.width=8-----------------------------
+par(mar = c(10, 4, 4, 2) + 0.1) # enlarge the bottom margin (first element) so the rotated axis labels fit
+N <- 30  # number of top-ranked OTUs drawn in the bar plot
+barplot(sort(taxa_sums(enterotype), TRUE)[1:N]/nsamples(enterotype), las=2)  # sort(x, TRUE) is decreasing order; heights are sums divided by the sample count
+
+## ------------------------------------------------------------------------
+rank_names(enterotype)
+
+## ------------------------------------------------------------------------
+TopNOTUs <- names(sort(taxa_sums(enterotype), TRUE)[1:10])  # names of the 10 largest sums (hard-coded 10, not N)
+ent10 <- prune_taxa(TopNOTUs, enterotype)  # keep only those 10 taxa
+print(ent10)
+
+## ------------------------------------------------------------------------
+sample_variables(ent10)
+
+## ----entbarplot0, fig.height=6, fig.width=10-----------------------------
+plot_bar(ent10, "SeqTech", fill="Enterotype", facet_grid=~Genus)  # one facet per Genus, bars filled by Enterotype
+
+## ----GPheatmap-----------------------------------------------------------
+data("GlobalPatterns")
+gpac <- subset_taxa(GlobalPatterns, Phylum=="Crenarchaeota")  # keep only taxa whose Phylum is Crenarchaeota
+(p <- plot_heatmap(gpac, "NMDS", "bray", "SampleType", "Family"))  # positional args; presumably method, distance, sample label, taxa label -- confirm in ?plot_heatmap
+
+## ----GPheatmap-rename-axes-----------------------------------------------
+p$scales$scales[[1]]$name <- "My X-Axis"  # NOTE(review): relies on the order of the scales stored inside the plot object
+p$scales$scales[[2]]$name <- "My Y-Axis"
+print(p)
+
+## ----plot_sample_network, fig.width=11, fig.height=7, message=FALSE, warning=FALSE----
+data(enterotype)
+plot_net(enterotype, maxdist=0.4, color="SeqTech", shape="Enterotype")  # maxdist is set to 0.4 here
+
+## ----eval=FALSE----------------------------------------------------------
+# my.physeq <- import("Biom", BIOMfilename="myBiomFile.biom")
+# my.ord <- ordinate(my.physeq)
+# plot_ordination(my.physeq, my.ord, color="myFavoriteVariable")
+
+## ----help-import, eval=FALSE---------------------------------------------
+# help(import)
+# help(ordinate)
+# help(distance)
+# help(plot_ordination)
+
+## ----GP-data-load--------------------------------------------------------
+data(GlobalPatterns)
+
+## ---- eval=FALSE---------------------------------------------------------
+# GPUF <- UniFrac(GlobalPatterns)
+
+## ----load-precomputed-UF-------------------------------------------------
+load(system.file("doc", "Unweighted_UniFrac.RData", package="phyloseq"))  # restores the precomputed distance object used below as GPUF
+
+## ------------------------------------------------------------------------
+GloPa.pcoa <- ordinate(GlobalPatterns, method="PCoA", distance=GPUF)  # PCoA on the precomputed GPUF distances
+
+## ----PCoAScree, fig.width=6, fig.height=4--------------------------------
+plot_scree(GloPa.pcoa, "Scree plot for Global Patterns, UniFrac/PCoA")
+
+## ----GPfig5ax1213--------------------------------------------------------
+(p12 <- plot_ordination(GlobalPatterns, GloPa.pcoa, "samples", color="SampleType") +
+ geom_point(size=5) + geom_path() + scale_colour_hue(guide = "none") )  # guide = "none" hides the colour legend
+(p13 <- plot_ordination(GlobalPatterns, GloPa.pcoa, "samples", axes=c(1, 3),
+ color="SampleType") + geom_line() + geom_point(size=5) )  # same ordination, axes 1 and 3
+
+## ----GP_UF_NMDS0---------------------------------------------------------
+# (Re)load the GlobalPatterns data and the precomputed UniFrac distance object
+data(GlobalPatterns)
+load(system.file("doc", "Unweighted_UniFrac.RData", package="phyloseq"))
+# Perform NMDS on the GPUF distances; no axis count is passed, so ordinate()'s default applies
+GP.NMDS <- ordinate(GlobalPatterns, "NMDS", GPUF)
+(p <- plot_ordination(GlobalPatterns, GP.NMDS, "samples", color="SampleType") +
+ geom_line() + geom_point(size=5) )
+
+## ----GPCAscree0, fig=FALSE-----------------------------------------------
+data(GlobalPatterns)
+# Take a subset of GlobalPatterns: the 200 taxa with the largest taxa_sums()
+topsp <- names(sort(taxa_sums(GlobalPatterns), TRUE)[1:200])
+GP <- prune_taxa(topsp, GlobalPatterns)  # overwrites the GP object defined earlier in this script
+# Subset further to the 5 phyla with the largest summed abundance among those 200 OTUs.
+top5ph <- sort(tapply(taxa_sums(GP), tax_table(GP)[, "Phylum"], sum), decreasing=TRUE)[1:5]
+GP <- subset_taxa(GP, Phylum %in% names(top5ph))
+# Re-add the `human` vector computed earlier; assumes sample order is unchanged -- TODO confirm
+sample_data(GP)$human <- factor(human)
+
+## ----GPCAscree, fig.width=8, fig.height=5--------------------------------
+# Now perform an unconstrained correspondence analysis
+gpca <- ordinate(GP, "CCA")
+# Scree plot
+plot_scree(gpca, "Scree Plot for Global Patterns Correspondence Analysis")
+
+## ----GPCA1234------------------------------------------------------------
+(p12 <- plot_ordination(GP, gpca, "samples", color="SampleType") +
+ geom_line() + geom_point(size=5) )
+(p34 <- plot_ordination(GP, gpca, "samples", axes=c(3, 4), color="SampleType") +
+ geom_line() + geom_point(size=5) )
+
+## ----GPCAspecplot0-------------------------------------------------------
+p1 <- plot_ordination(GP, gpca, "species", color="Phylum")
+(p1 <- ggplot(p1$data, p1$mapping) + geom_point(size=5, alpha=0.5) +
+ facet_wrap(~Phylum) + scale_colour_hue(guide = "none") )  # rebuild from the stored data/mapping; guide = "none" hides the legend
+
+## ----GPCAspecplotTopo0---------------------------------------------------
+(p3 <- ggplot(p1$data, p1$mapping) + geom_density2d() +
+ facet_wrap(~Phylum) + scale_colour_hue(guide = "none") )
+
+## ----GPCAjitter0---------------------------------------------------------
+library("reshape2")
+# Melt the species data (p1$data) to long form so each CA axis can be faceted separately
+mdf <- melt(p1$data[, c("CA1", "CA2", "Phylum", "Family", "Genus")],
+ id.vars=c("Phylum", "Family", "Genus") )
+# Select the points to label: CA2 values below -1.0
+LF <- subset(mdf, variable=="CA2" & value < -1.0)
+# build plot: boxplot summaries of each CA-axis, with labels
+p <- ggplot(mdf, aes(Phylum, value, color=Phylum)) +
+ geom_boxplot() +
+ facet_wrap(~variable, nrow=2) +
+ scale_colour_hue(guide = "none") +
+ theme_bw() +
+ theme( axis.text.x = element_text(angle = -90, vjust = 0.5) )
+# Add the text label layer (rows with a non-missing Family only), and render ggplot graphic
+(p <- p + geom_text(data=subset(LF, !is.na(Family)),
+ mapping = aes(Phylum, value+0.1, color=Phylum, label=Family),
+ vjust=0,
+ size=2))
+
+## ----GPtaxaplot0---------------------------------------------------------
+plot_bar(GP, x="human", fill="SampleType", facet_grid= ~ Phylum)
+
+## ----GPdpcoa01-----------------------------------------------------------
+GP.dpcoa <- ordinate(GP, "DPCoA")
+pdpcoa <- plot_ordination(GP, GP.dpcoa, type="biplot",
+ color="SampleType", shape="Phylum")
+shape.fac <- pdpcoa$data[, "Phylum"]  # the column mapped to shape above; indexed by name rather than by deparsing the mapping
+man.shapes <- c(19, 21:25)  # six shape codes: the first for "Samples", the rest for the remaining levels
+names(man.shapes) <- c("Samples", levels(shape.fac)[levels(shape.fac)!="Samples"])
+p2dpcoa <- pdpcoa + scale_shape_manual(values=man.shapes)  # stored, not printed in this chunk
+
+## ----GPdpcoa02-----------------------------------------------------------
+# Show just Samples or just Taxa
+plot_ordination(GP, GP.dpcoa, type="taxa", shape="Phylum")
+plot_ordination(GP, GP.dpcoa, type="samples", color="SampleType")
+# Split
+plot_ordination(GP, GP.dpcoa, type="split",
+ color="SampleType", shape="Phylum") +
+ ggplot2::scale_colour_discrete()
+
+## ----distancefun---------------------------------------------------------
+data(esophagus)
+distance(esophagus, "bray")
+distance(esophagus, "wunifrac") # weighted UniFrac
+distance(esophagus, "jaccard") # vegdist jaccard
+distance(esophagus, "g") # betadiver method option "g"
+
+## ----eval=FALSE, echo=TRUE-----------------------------------------------
+# data(esophagus)
+# distance(esophagus, "wUniFrac")
+# distance(esophagus, "uUniFrac")
+
+## ------------------------------------------------------------------------
+# (Re)load the GlobalPatterns data and the precomputed UniFrac distance object
+data(GlobalPatterns)
+load(system.file("doc", "Unweighted_UniFrac.RData", package="phyloseq"))
+# Manually define a colour vector based on sample type: one rainbow colour per SampleType level.
+colorScale <- rainbow(length(levels(get_variable(GlobalPatterns, "SampleType"))))
+cols <- colorScale[get_variable(GlobalPatterns, "SampleType")]  # assumes SampleType is a factor, so its codes index colorScale -- TODO confirm
+GP.tip.labels <- as(get_variable(GlobalPatterns, "SampleType"), "character")  # NOTE(review): not referenced again in this script
+# This is the actual hierarchical clustering call, specifying average-link clustering
+GP.hclust <- hclust(GPUF, method="average")
+plot(GP.hclust, col=cols)
+
diff --git a/inst/doc/phyloseq-analysis.Rmd b/inst/doc/phyloseq-analysis.Rmd
new file mode 100644
index 0000000..345cf08
--- /dev/null
+++ b/inst/doc/phyloseq-analysis.Rmd
@@ -0,0 +1,506 @@
+---
+title: "Vignette for phyloseq: Analysis of high-throughput microbiome census data"
+output:
+ BiocStyle::html_document:
+ fig_height: 7
+ fig_width: 10
+ toc: yes
+ toc_depth: 2
+ number_sections: true
+---
+<!--
+%% \VignetteEngine{knitr::rmarkdown}
+%% \VignetteIndexEntry{analysis vignette}
+-->
+
+`r library("knitr")`
+`r opts_chunk$set(cache=FALSE, fig.width=9, message=FALSE, warning=FALSE)`
+
+Paul J. McMurdie and Susan Holmes
+
+<mcmurdie at stanford.edu>
+
+[phyloseq Home Page](http://joey711.github.io/phyloseq/)
+
+If you find phyloseq and/or its tutorials useful, please acknowledge and cite phyloseq in your publications:
+
+**phyloseq: An R package for reproducible interactive analysis and graphics of microbiome census data** (2013) PLoS ONE 8(4):e61217
+http://dx.plos.org/10.1371/journal.pone.0061217
+
+# Other resources
+The phyloseq project also has a number of supporting online resources, most of which can be found at [the phyloseq home page](http://joey711.github.com/phyloseq/), or from the phyloseq stable release [page on Bioconductor](http://bioconductor.org/packages/release/bioc/html/phyloseq.html).
+
+To post feature requests or ask for help, try [the phyloseq Issue Tracker](https://github.com/joey711/phyloseq/issues).
+
+
+# Summary
+
+The analysis of microbiological communities brings many challenges: the integration of many different types of data with methods from ecology, genetics, phylogenetics, network analysis, visualization and testing. The data itself may originate from widely different sources, such as the microbiomes of humans, soils, surface and ocean waters, wastewater treatment plants, industrial facilities, and so on; and as a result, these varied sample types may have very different forms and scales of [...]
+
+
+# About this vignette
+
+A separate vignette is included within the phyloseq-package that describes the basics of importing pre-clustered phylogenetic sequencing data, data filtering, as well as some transformations and some additional details about the package and installation. A quick way to load it is:
+
+```{r dontrun-basics-vignette, eval=FALSE}
+vignette("phyloseq-basics")
+```
+
+By contrast, this vignette is intended to provide functional examples of the analysis tools and wrappers included in phyloseq. All necessary code for performing the analysis and producing graphics will be included with its description, and the focus will be on the use of example data that is included and documented within the phyloseq-package.
+
+Let's start by loading the `phyloseq` package:
+
+```{r load-packages, message=FALSE, warning=FALSE}
+library("phyloseq")
+library("ggplot2")
+```
+
+Because we will show examples of custom modifications to ggplot2 plots,
+we also loaded ggplot2.
+Here I'll set as default my favorite ggplot2 theme.
+These are completely optional, and modifiable.
+
+```{r ggplot2-themes}
+theme_set(theme_bw())
+```
+
+
+
+# Data
+
+## Interface with the microbio.me/qiime server
+See the [microbio_me_qiime tutorial](http://joey711.github.io/phyloseq/download-microbio.me.html) for more details and examples downloading and importing into phyloseq/R directly from this public database.
+
+## Included Data
+To facilitate testing and exploration of tools in phyloseq, this package includes example data from published studies. Many of the examples in this vignette use either the [Global Patterns](http://www.pnas.org/content/early/2010/06/02/1000080107) or `enterotype` datasets as source data. The [Global Patterns](http://www.pnas.org/content/early/2010/06/02/1000080107) data was described in a [2011 article in PNAS](http://www.pnas.org/content/early/2010/06/02/1000080107)([Caporaso 2011](http: [...]
+
+Because this data is included in the package, the examples can easily be run on your own computer using the code shown in this vignette. The data is loaded into memory using the `data` command. Let's start by loading the [Global Patterns](http://www.pnas.org/content/early/2010/06/02/1000080107) data.
+
+```{r}
+data(GlobalPatterns)
+```
+
+Later on we will use an additional categorical designation --- human versus non-human associated samples --- that was not in the original dataset. Now is a good time to add it as an explicit variable of the `sample_data`, and because we don't want to type long words over and over, we'll choose a shorter name for this modified version of [Global Patterns](http://www.pnas.org/content/early/2010/06/02/1000080107), call it `GP`, and also remove a handful of taxa that are not present in any o [...]
+
+```{r }
+# prune OTUs that are not present in at least one sample
+GP <- prune_taxa(taxa_sums(GlobalPatterns) > 0, GlobalPatterns)
+# Define a human-associated versus non-human categorical variable:
+human <- get_variable(GP, "SampleType") %in% c("Feces", "Mock", "Skin", "Tongue")
+# Add new human variable to sample data:
+sample_data(GP)$human <- factor(human)
+```
+
+
+# Simple exploratory graphics
+
+## Easy Richness Estimates
+
+For further details, see the [plot_richness tutorial](http://joey711.github.io/phyloseq/plot_richness-examples.html)
+
+We can easily create a complex graphic that compares the richness estimates of samples from different environment types in the [Global Patterns](http://www.pnas.org/content/early/2010/06/02/1000080107) dataset, using the `plot_richness` function. Note that it is important to use raw (untrimmed) OTU-clustered data when performing richness estimates, as they can be highly dependent on the number of singletons in a sample.
+
+```{r richness_estimates0, fig.width=13, fig.height=7}
+alpha_meas <- c("Observed", "Chao1", "ACE", "Shannon", "Simpson", "InvSimpson")  # measures passed to plot_richness()
+(p <- plot_richness(GP, "human", "SampleType", measures=alpha_meas))
+```
+
+Add a ggplot2 box plot layer to the previous plot
+
+```{r richness_estimates, fig.width=13, fig.height=7}
+p + geom_boxplot(data=p$data, aes(x=human, y=value, color=NULL), alpha=0.1)  # overlay a boxplot per `human` group
+```
+Alpha diversity estimators for samples in the *Global Patterns* dataset. Each panel shows a different type of estimator. Individual color-shaded points and brackets represent the richness estimate and the theoretical standard error range associated with that estimate, respectively. The colors group the sample-sources into "types". Within each panel, the samples are further organized into human-associated (`TRUE`) or not (`FALSE`), and a boxplot is overlayed on top of this for the two gro [...]
+
+
+## Exploratory tree plots
+
+For further details, see the [plot_tree tutorial](http://joey711.github.io/phyloseq/plot_tree-examples.html)
+
+phyloseq also contains a method for easily plotting an annotated phylogenetic tree with information regarding the sample in which a particular taxon was observed, and optionally the number of individuals that were observed.
+
+For the sake of creating a readable tree, let's subset the data to just the [Chlamydiae](http://en.wikipedia.org/wiki/Chlamydiae) phylum, which consists of obligate intracellular pathogens and is present in only a subset of environments in this dataset.
+
+```{r }
+GP.chl <- subset_taxa(GP, Phylum=="Chlamydiae")
+```
+
+And now we will create the tree graphic from this subset of [Global Patterns](http://www.pnas.org/content/early/2010/06/02/1000080107), shading by the `SampleType` variable, which indicates the environment category from which the microbiome samples originated. The following command also takes the option of labeling the number of individuals observed in each sample (if at all) of each taxon. The symbols are slightly enlarged as the number of individuals increases.
+
+```{r GP-chl-tree, fig.width=15, fig.height=7, message=FALSE, warning=FALSE}
+plot_tree(GP.chl, color="SampleType", shape="Family", label.tips="Genus", size="Abundance")
+```
+Phylogenetic tree representation of the Chlamydiae species in the microbiome samples of the "Global Patterns" dataset([Caporaso 2011](http://www.pnas.org/content/early/2010/06/02/1000080107)).
+
+
+## Exploratory bar plots
+
+For further details, see the [plot_bar tutorial](http://joey711.github.io/phyloseq/plot_bar-examples.html)
+
+In the following example we use the included "enterotype" dataset ([Arumugam 2011](http://www.nature.com/nature/journal/v473/n7346/full/nature09944.html)).
+
+```{r}
+data(enterotype)
+```
+We start with a simple rank-abundance barplot, using the cumulative fractional abundance of each OTU in the dataset. In the enterotype dataset, the available published data are simplified as sample-wise fractional occurrences, rather than counts of individuals\footnote{Unfortunate, as this means we lose information about the total number of reads and associated confidences, ability to do more sophisticated richness estimates, etc. For example, knowing that we observed 1 sequence read of [...]
+
+```{r EntAbundPlot, fig.height=6, fig.width=8}
+par(mar = c(10, 4, 4, 2) + 0.1) # make more room on bottom margin
+N <- 30
+barplot(sort(taxa_sums(enterotype), TRUE)[1:N]/nsamples(enterotype), las=2)
+```
+An example exploratory barplot using base R graphics and the `taxa_sums` and `nsamples` functions.
+
+Note that this first barplot is clipped at the `r N`th OTU. This was chosen because `ntaxa(enterotype) =``r ntaxa(enterotype)` OTUs would not be legible on the plot. As you can see, the relative abundances have decreased dramatically by the 10th-ranked OTU.
+
+So what are these OTUs? In the `enterotype` dataset, only a single taxonomic rank type is present:
+```{r}
+rank_names(enterotype)
+```
+This means the OTUs in this dataset have been grouped at the level of genera, and no other taxonomic grouping/transformation is possible without additional information (like might be present in a phylogenetic tree, or with further taxonomic classification analysis).
+
+We need to know which taxonomic rank classifiers, if any, we have available to specify in the second barplot function in this example, `plot_bar()`. We have already observed how quickly the abundance decreases with rank, so we will subset the enterotype dataset to the 10 most abundant taxa in order to make the barplot legible on this page.
+
+```{r}
+TopNOTUs <- names(sort(taxa_sums(enterotype), TRUE)[1:10])
+ent10 <- prune_taxa(TopNOTUs, enterotype)
+print(ent10)
+```
+
+Note also that there are `r nsamples(ent10)` samples in this dataset, and so a remaining challenge is to consolidate these samples into meaningful groups. A good place to look is the available sample variables, which in most cases will carry more "meaning" than the sample names alone.
+
+```{r}
+sample_variables(ent10)
+```
+
+The parameters to `plot_bar` in the following code-chunk were chosen after various trials. We suggest that you also try different parameter settings while you're exploring different features of the data. In addition to the variables names of `sample_data`, the `plot_bar()` function recognizes the names of taxonomic ranks (if present). See the help documentation and further details in the examples and on the wiki page. In this example we have also elected to organize data by "facets" (sep [...]
+
+```{r entbarplot0, fig.height=6, fig.width=10}
+plot_bar(ent10, "SeqTech", fill="Enterotype", facet_grid=~Genus)
+```
+
+An example exploratory bar plot using the `plot_bar` function. In this case we have faceted the data (abundance values) according to the genera of each OTU. The subset of OTUs that have not been assigned to a specific genus are in the `NA` panel. Within each facet, the data is further separated by sequencing technology, and each OTU is shaded according to the enterotype of the sample from which it came. Abundance values from different samples and OTUs but having the same variables ma [...]
+
+Figure summarizes quantitatively the increased abundances of Bacteroides and Prevotella in the Enterotypes 1 and 2, respectively. Interestingly, a large relative abundance of Blautia was observed for Enterotype 3, but only from 454-pyrosequencing data sets, not the Illumina or Sanger datasets. This suggests the increased Blautia might actually be an artifact. Similarly, Prevotella appears to be one of the most abundant genera in the Illumina-sequenced samples among Enterotype 3, but this [...]
+
+
+
+
+# Exploratory analysis and graphics
+
+## Exploratory Heat Map
+
+For further details, see the [plot_heatmap tutorial](http://joey711.github.io/phyloseq/plot_heatmap-examples.html)
+
+As the number of taxa in a dataset gets very large, the ability to effectively display all of the elements of the data becomes compromised, and a heatmap representation is no exception. It can also be time-consuming to render. To address both these issues, we show an example in which we have subsetted the Global Patterns dataset to a manageable portion, in this case, the Crenarchaeota phylum.
+
+```{r GPheatmap}
+data("GlobalPatterns")
+gpac <- subset_taxa(GlobalPatterns, Phylum=="Crenarchaeota")
+(p <- plot_heatmap(gpac, "NMDS", "bray", "SampleType", "Family"))
+```
+
+What if you wanted to change the axis labels?
+
+```{r GPheatmap-rename-axes}
+p$scales$scales[[1]]$name <- "My X-Axis"
+p$scales$scales[[2]]$name <- "My Y-Axis"
+print(p)
+```
+
+Note that it is possible to order the sample/species indices
+by any of the ordination methods supported in the `ordinate` function;
+and also that the color scheme can be modified with additional arguments.
+
+
+Heat map representation of the Crenarchaeota phylum abundance pattern across different sample types in the Global Patterns dataset.
+
+
+## Microbiome Network Representation
+
+For further details, see the [plot_network tutorial](http://joey711.github.io/phyloseq/plot_network-examples.html)
+
+Continuing with the `enterotype` dataset, here are some examples for creating a custom network representation of the relationship between microbiome samples in an experiment. This relies heavily on the igraph and ggplot2 packages to create a network display of the "connectedness" of samples according to some user-provided ecological similarity. By default, points represent microbiome samples, and are determined using an algorithm that optimizes the clarity of the display of network "edges [...]
+
+In this example, the default dissimilarity index was used (Jaccard, co-occurrence), with a maximum distance of `0.4` required to create an edge. Any function that can operate on phyloseq-objects and return a sample-wise distance can be provided as the `dist.fun` argument, or a character string of the name of the distance function already supported in phyloseq. Other distances may result in very different clustering, and this is a choice that should be understood and not taken too lightly [...]
+
+Interestingly, at this level of analysis and parameter-settings the two major sub-graphs appear to be best explained by the sequencing technology and not the subject enterotype, suggesting that the choice of sequencing technology has a major effect on the microbial community one can observe. This seems to differ somewhat with the inferences described in the "enterotype" article ([Arumugam 2011](http://www.nature.com/nature/journal/v473/n7346/full/nature09944.html)). However, there could [...]
+
+```{r plot_sample_network, fig.width=11, fig.height=7, message=FALSE, warning=FALSE}
+data(enterotype)
+plot_net(enterotype, maxdist=0.4, color="SeqTech", shape="Enterotype")
+```
+
+Network representation of the relationship between microbiome samples in the "Enterotype" dataset ([Arumugam 2011](http://www.nature.com/nature/journal/v473/n7346/full/nature09944.html)).
+
+
+
+
+## Ordination Methods
+
+For further details, see the [plot_ordination tutorial](http://joey711.github.io/phyloseq/plot_ordination-examples.html)
+
+Ordination methods can be a useful tool for exploring complex phylogenetic sequencing data, particularly when the hypothesized structure of the data is poorly defined (or there isn't a hypothesis). The phyloseq package provides some useful tools for performing ordinations and plotting their results, via the `ordinate()` and `plot_ordination()` functions, respectively. Although there are many options and methods supported, a first-step will probably look something like the following:
+
+```{r eval=FALSE}
+my.physeq <- import("Biom", BIOMfilename="myBiomFile.biom")  # placeholder file name; substitute your own
+my.ord <- ordinate(my.physeq)
+plot_ordination(my.physeq, my.ord, color="myFavoriteVariable")  # placeholder variable name
+```
+
+It is probably a good idea to read the documentation for these two functions, as they also provide links to related functions and additional examples you can try immediately on your own machine.
+```{r help-import, eval=FALSE}
+help(import)
+help(ordinate)
+help(distance)
+help(plot_ordination)
+```
+
+
+### Principal Coordinates Analysis (PCoA)
+
+We take as our first example, a reproduction of Figure 5 from the "Global Patterns" article([Caporaso 2011](http://www.pnas.org/content/early/2010/06/02/1000080107)). The authors show a 3-dimensional representation of the first three axes of a Principal Coordinates Analysis (PCoA; This is also sometimes referred to as "Multi-Dimensional Scaling", or "MDS") performed on the unweighted-UniFrac distance using all of the available sequences (their approach included both 5' and 3' sequences). [...]
+
+The following reproduces the unweighted UniFrac distance calculation on the full dataset. Note that this calculation can take a long time because of the large number of OTUs. Parallelization is recommended for large datasets, typically if they are as large as [Global Patterns](http://www.pnas.org/content/early/2010/06/02/1000080107), or larger. For details on parallelization, see the details section and examples in the `UniFrac()` documentation, and also the page dedicated to the topic o [...]
+
+http://joey711.github.io/phyloseq-demo/unifrac.html
+
+```{r GP-data-load}
+data(GlobalPatterns)
+```
+```{r, eval=FALSE}
+GPUF <- UniFrac(GlobalPatterns)
+```
+
+Load the pre-computed distance matrix, `GPUF`
+
+```{r load-precomputed-UF}
+load(system.file("doc", "Unweighted_UniFrac.RData", package="phyloseq"))
+```
+
+Calculate the PCoA on this distance matrix, `GPUF`.
+
+```{r}
+GloPa.pcoa = ordinate(GlobalPatterns, method="PCoA", distance=GPUF)
+```
+
+Before we look at the results, let's first investigate how much of the total distance structure we will capture in the first few axes. We can do this graphically with a "scree plot", an ordered barplot of the relative fraction of the total eigenvalues associated with each axis.
+
+```{r PCoAScree, fig.width=6, fig.height=4}
+plot_scree(GloPa.pcoa, "Scree plot for Global Patterns, UniFrac/PCoA")
+```
+
+Scree plot of the PCoA used to create Figure 5 from the "Global Patterns" article([Caporaso 2011](http://www.pnas.org/content/early/2010/06/02/1000080107)). The first three axes represent `r round(100*sum(GloPa.pcoa$values$Relative_eig[1:3]))`% of the total variation in the distances. Interestingly, the fourth axis represents another `r round(100*(GloPa.pcoa$values$Relative_eig[4]))`%, and so may warrant exploration as well. A scree plot is an important tool for any ordination method, as [...]
+
+Next, we will reproduce Figure 5 from the "Global Patterns" article ([Caporaso 2011](http://www.pnas.org/content/early/2010/06/02/1000080107)), but separating the three axes into 2 plots using `plot_ordination()`.
+
+```{r GPfig5ax1213}
+(p12 <- plot_ordination(GlobalPatterns, GloPa.pcoa, "samples", color="SampleType") +
+ geom_point(size=5) + geom_path() + scale_colour_hue(guide = FALSE) )
+(p13 <- plot_ordination(GlobalPatterns, GloPa.pcoa, "samples", axes=c(1, 3),
+ color="SampleType") + geom_line() + geom_point(size=5) )
+```
+A reproduction in phyloseq / R of the main panel of Figure 5 from the "Global Patterns" article([Caporaso 2011](http://www.pnas.org/content/early/2010/06/02/1000080107)), on two plots. The horizontal axis represents the first axis in the PCoA ordination, while the top and bottom vertical axes represent the second and third axes, respectively. Different points represent different samples within the dataset, and are shaded according to the environment category to which they belong. The col [...]
+
+
+### non-metric Multi-Dimensional Scaling (NMDS)
+We repeat the previous example, but instead using non-metric multidimensional scaling (NMDS) limited to just two dimensions. This approach limits the amount of residual distance "not shown" in the first two (or three) axes, but forfeits some mathematical properties and does not always converge within the specified number of axes.
+
+```{r GP_UF_NMDS0}
+# (Re)load UniFrac distance matrix and GlobalPatterns data
+data(GlobalPatterns)
+load(system.file("doc", "Unweighted_UniFrac.RData", package="phyloseq"))
+# perform NMDS, set to 2 axes
+GP.NMDS <- ordinate(GlobalPatterns, "NMDS", GPUF)
+(p <- plot_ordination(GlobalPatterns, GP.NMDS, "samples", color="SampleType") +
+ geom_line() + geom_point(size=5) )
+```
+An example exploratory ordination using non-metric multidimensional scaling (NMDS) on the unweighted UniFrac distance between samples of the "Global Patterns" dataset. Sample points are shaded by environment type, and connected by a line if they belong to the same type. Compare with Figure 5 from the "Global Patterns" article ([Caporaso 2011](http://www.pnas.org/content/early/2010/06/02/1000080107)).
+
+The figure nicely shows the relative dissimilarities between microbial communities from different habitats. However, it fails to indicate what was different between the communities. For an ordination method that provides information on the taxa that explain differences between samples (or groups of samples), we use Correspondence Analysis.
+
+### Correspondence Analysis (CA)
+
+In the following section we will continue our exploration of the "GlobalPatterns" dataset using various features of an ordination method called Correspondence Analysis. We give special emphasis to exploratory interpretations using the biplot, because it provides additional information that is not available from PCoA or NMDS.
+
+Let's start by performing a Correspondence Analysis and investigating the scree plot. Both interestingly and challengingly, the scree plot suggests that the [Global Patterns](http://www.pnas.org/content/early/2010/06/02/1000080107) abundance data is quite high-dimensional, with the first two CA axes accounting for not quite 17% of the total (chi-square) variability. Note the absence of a steep decline in eigenvalue fraction as axis number increases. Each additional axis represents only m [...]
+
+First, let's severely subset the number of species for the sake of run-time. (This is for illustration purposes only; do not repeat unless you are very sure you have a good reason for doing this.)
+
+```{r GPCAscree0, fig=FALSE}
+data(GlobalPatterns)
+# Take a subset of the GP dataset, top 200 species
+topsp <- names(sort(taxa_sums(GlobalPatterns), TRUE)[1:200])
+GP <- prune_taxa(topsp, GlobalPatterns)
+# Subset further to top 5 phyla, among the top 200 OTUs.
+top5ph <- sort(tapply(taxa_sums(GP), tax_table(GP)[, "Phylum"], sum), decreasing=TRUE)[1:5]
+GP <- subset_taxa(GP, Phylum %in% names(top5ph))
+# Re-add human variable to sample data:
+sample_data(GP)$human <- factor(human)
+```
+
+Now perform the correspondence analysis.
+
+```{r GPCAscree, fig.width=8, fig.height=5}
+# Now perform a unconstrained correspondence analysis
+gpca <- ordinate(GP, "CCA")
+# Scree plot
+plot_scree(gpca, "Scree Plot for Global Patterns Correspondence Analysis")
+```
+The correspondence analysis (CA) scree plot of the "Global Patterns" dataset.
+
+Now let's investigate how the samples behave on the first few CA axes.
+
+```{r GPCA1234}
+(p12 <- plot_ordination(GP, gpca, "samples", color="SampleType") +
+ geom_line() + geom_point(size=5) )
+(p34 <- plot_ordination(GP, gpca, "samples", axes=c(3, 4), color="SampleType") +
+ geom_line() + geom_point(size=5) )
+```
+First 4 axes of Correspondence Analysis (CA) of the "Global Patterns" dataset ([Caporaso 2011](http://www.pnas.org/content/early/2010/06/02/1000080107)).
+
+A clear feature of these plots is that the feces and mock communities cluster tightly together, far away from all other samples on the first axis (CA1). The skin and tongue samples separate similarly, but on the second axis. Taken together, it appears that the first two axes are best explained by the separation of human-associated "environments" from the other non-human environments in the dataset, with a secondary separation of tongue and skin samples from feces.
+
+We will now investigate further this top-level structure of the data, using an additional feature of correspondence analysis that allows us to compare the relative contributions of individual taxa on the same graphical space: the "biplot". However, because we just displayed the position of samples in the ordination and there are often many thousands of OTUs, we will focus on creating an interpretable plot of the OTUs. For creating graphics that combine the two plots, try the `"biplot"` o [...]
+
+```{r GPCAspecplot0}
+p1 <- plot_ordination(GP, gpca, "species", color="Phylum")
+(p1 <- ggplot(p1$data, p1$mapping) + geom_point(size=5, alpha=0.5) +
+ facet_wrap(~Phylum) + scale_colour_hue(guide = FALSE) )
+```
+
+Species plot of the "Global Patterns" correspondence analysis first two axes, with each phylum on a different panel ("facet"). Only the most abundant 5 phyla among the most abundant 200 taxa (cumulative, all samples) are included. Arbitrary reduction, for computational efficiency of example.
+
+Let's try drawing the figure again, only this time summarizing the species points as a 2D density estimate, without any individual points.
+
+```{r GPCAspecplotTopo0}
+(p3 <- ggplot(p1$data, p1$mapping) + geom_density2d() +
+ facet_wrap(~Phylum) + scale_colour_hue(guide = FALSE) )
+```
+The previous figure, which is severely overplotted, redrawn as a 2-dimensional species-density topographic map, faceted in the same way.
+
+These figures reveal some useful patterns and interesting outliers, but what if we want a complete summary of how each phylum is represented along each axis? The following code is a way to show this using boxplots, while still avoiding the occlusion problem (points layered on top of each other), and also conveying some useful information about the pattern of taxa that contribute to the separation of human-associated samples from the other sample types. It re-uses the data that was stored [...]
+
+```{r GPCAjitter0}
+library("reshape2")
+# Melt the species-data.frame, DF, to facet each CA axis separately
+mdf <- melt(p1$data[, c("CA1", "CA2", "Phylum", "Family", "Genus")],
+ id=c("Phylum", "Family", "Genus") )
+# Select some special outliers for labelling
+LF <- subset(mdf, variable=="CA2" & value < -1.0)
+# build plot: boxplot summaries of each CA-axis, with labels
+p <- ggplot(mdf, aes(Phylum, value, color=Phylum)) +
+ geom_boxplot() +
+ facet_wrap(~variable, 2) +
+ scale_colour_hue(guide = FALSE) +
+ theme_bw() +
+ theme( axis.text.x = element_text(angle = -90, vjust = 0.5) )
+# Add the text label layer, and render ggplot graphic
+(p <- p + geom_text(data=subset(LF, !is.na(Family)),
+ mapping = aes(Phylum, value+0.1, color=Phylum, label=Family),
+ vjust=0,
+ size=2))
+```
+Boxplot of taxa (species in this case) of the "Global Patterns" CA first two axes, shaded/separated by phylum. Through this approach it is much easier to see particular species that cluster unusually relative to the rest of their phylum, for example the Bacteroidetes species (Prevotellaceae family) that is positioned most in the negative CA2 direction toward the Tongue/Skin samples.
+
+One way to relate some of the high-level patterns we observed from correspondence analysis is to directly visualize the abundances in an organized, quantitative way, to see if this does in fact support / explain the human/environment microbiome differences. Here is an example using the `plot_bar` function described in an earlier section.
+
+```{r GPtaxaplot0}
+plot_bar(GP, x="human", fill="SampleType", facet_grid= ~ Phylum)
+```
+Phylum-level comparison of relative abundance of taxa in samples that are from human microbiomes (or not).
+
+In this figure we've used the `threshold` parameter to omit all but phyla accounting for the top 90% of phyla in any one sample. Some patterns emerging from this display appear to be: (1) Cyanobacteria, Actinobacteria appear under-represented in human samples; (2) conversely, Firmicutes appear over-represented in human samples; (3) Acidobacteria, Verrucomicrobia appear over-represented in the fecal samples; (4) the only Crenarchaeota were observed in the Mock sample, which is not really [...]
+
+
+### Double Principal Coordinate Analysis (DPCoA)
+
+Here is a quick example illustrating the use of Double Principal Coordinate Analysis (DPCoA; Pavoine et al. 2004), using the `ordinate()` function in phyloseq, as well as the "biplot" option for `plot_ordination()`. For a description that includes an applied example using the "enterotype" dataset and comparison with UniFrac/PCoA, see Fukuyama et al. (2012).
+
+```{r GPdpcoa01}
+GP.dpcoa <- ordinate(GP, "DPCoA")
+pdpcoa <- plot_ordination(GP, GP.dpcoa, type="biplot",
+ color="SampleType", shape="Phylum")
+shape.fac <- pdpcoa$data[, deparse(pdpcoa$mapping$shape)]
+man.shapes <- c(19, 21:25)
+names(man.shapes) <- c("Samples", levels(shape.fac)[levels(shape.fac)!="Samples"])
+p2dpcoa <- pdpcoa + scale_shape_manual(values=man.shapes)
+```
+
+A biplot representation of a Double Principal Coordinate Analysis (DPCoA), on a simplified version of the "Global Patterns" dataset with only the most abundant 200 OTUs included.
+
+```{r GPdpcoa02}
+# Show just Samples or just Taxa
+plot_ordination(GP, GP.dpcoa, type="taxa", shape="Phylum")
+plot_ordination(GP, GP.dpcoa, type="samples", color="SampleType")
+# Split
+plot_ordination(GP, GP.dpcoa, type="split",
+ color="SampleType", shape="Phylum") +
+ ggplot2::scale_colour_discrete()
+```
+
+
+## Distance Methods
+
+### distance(): Central Distance Function
+
+Many comparisons of microbiome samples, including the graphical model and the PCoA analysis, require a calculation for the relative dissimilarity/distance between one microbial community and another. The phyloseq-package provides a general "wrapper" function for calculating ecological distance matrices between the samples in an experiment.
+
+`distance()` currently supports 43 method options, as well as user-provided arbitrary methods via an interface to vegan's `designdist()` function. Currrently only sample-wise distances are supported (the `type` argument), but eventually species-wise (OTU-wise) distances will be supported as well. In addition to supporting any of the method options to the three main distance functions of the vegan-package~\cite{veganpkg --- including the 14 distances of the `vegdist()` function and all 24 [...]
+
+The function takes a `phyloseq-class` object and an argument indicating the distance type; and it returns a `dist`-class distance matrix.
+
+```{r distancefun}
+data(esophagus)
+distance(esophagus, "bray")
+distance(esophagus, "wunifrac") # weighted UniFrac
+distance(esophagus, "jaccard") # vegdist jaccard
+distance(esophagus, "g") # betadiver method option "g"
+```
+
+
+### UniFrac and weighted UniFrac
+UniFrac is a recently-defined~\cite{Lozupone:2005gn and popular distance metric to summarize the difference between pairs of ecological communities. All UniFrac variants use a phylogenetic tree of the relationship among taxa as central information to calculating the distance between two samples/communities. An unweighted UniFrac distance matrix only considers the presence/absence of taxa, while weighted UniFrac accounts for the relative abundance of taxa as well as their phylogenetic dis [...]
+
+The following is an example calculating the UniFrac distance (both weighted and unweighted) matrix using the "esophagus" example dataset:
+
+```{r eval=FALSE, echo=TRUE}
+data(esophagus)
+distance(esophagus, "wUniFrac")
+distance(esophagus, "uUniFrac")
+```
+
+See the phyloseq demo page about fast parallel UniFrac.
+
+
+## Hierarchical Clustering
+Another potentially useful and popular way to visualize/decompose sample-distance matrices is through hierarchical clustering (e.g. `hclust`). In the following example, we reproduce Figure 4 from the ["Global Patterns" article](http://www.pnas.org/content/early/2010/06/02/1000080107), using the unweighted UniFrac distance and the UPGMA method (`hclust` parameter `method="average"`). Try `help("hclust")` for alternative clustering methods included in standard R.
+
+```{r}
+# (Re)load UniFrac distance matrix and GlobalPatterns data
+data(GlobalPatterns)
+load(system.file("doc", "Unweighted_UniFrac.RData", package="phyloseq"))
+# Manually define color-shading vector based on sample type.
+colorScale <- rainbow(length(levels(get_variable(GlobalPatterns, "SampleType"))))
+cols <- colorScale[get_variable(GlobalPatterns, "SampleType")]
+GP.tip.labels <- as(get_variable(GlobalPatterns, "SampleType"), "character")
+# This is the actual hierarchical clustering call, specifying average-link clustering
+GP.hclust <- hclust(GPUF, method="average")
+plot(GP.hclust, col=cols)
+```
+
+An alternative means of summarizing a distance matrix via hierarchical clustering and plotting as an annotated dendrogram. Compare with Figure 4 from the [Global Patterns](http://www.pnas.org/content/early/2010/06/02/1000080107)). Some differences in Figure~\ref{fig:GPfig4 from the original article might be explained by [Global Patterns](http://www.pnas.org/content/early/2010/06/02/1000080107) in phyloseq being the summed observations from both primer directions (5' and 3'), while in the [...]
+
+
+# Multiple Testing and Differential Abundance
+
+One of our recommended approaches to this problem was described in
+McMurdie and Holmes (2014) [Waste Not, Want Not: Why Rarefying Microbiome Data is Inadmissible](http://dx.plos.org/10.1371/journal.pcbi.1003531).
+PLoS Computational Biology. 10(4):e1003531
+
+Some reproducible demonstrations of this approach are included in
+[the phyloseq extensions repository](http://joey711.github.io/phyloseq-extensions/extensions-index.html), the `phyloseq_to_deseq2` function,
+as well as a separate vignette dedicated to this topic
+(phyloseq and DESeq2 on Colorectal Cancer Data).
+
+Please make use of these materials for differential abundance testing.
diff --git a/inst/doc/phyloseq-analysis.html b/inst/doc/phyloseq-analysis.html
new file mode 100644
index 0000000..abb56dc
--- /dev/null
+++ b/inst/doc/phyloseq-analysis.html
@@ -0,0 +1,463 @@
+<!DOCTYPE html>
+
+<html xmlns="http://www.w3.org/1999/xhtml">
+
+<head>
+
+<meta charset="utf-8">
+<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
+<meta name="generator" content="pandoc" />
+
+
+
+<title>Vignette for phyloseq: Analysis of high-throughput microbiome census data</title>
+
+<link href="data:text/css;charset=utf-8,pre%20%2Eoperator%2C%0Apre%20%2Eparen%20%7B%0Acolor%3A%20rgb%28104%2C%20118%2C%20135%29%0A%7D%0Apre%20%2Eliteral%20%7B%0Acolor%3A%20%23990073%0A%7D%0Apre%20%2Enumber%20%7B%0Acolor%3A%20%23099%3B%0A%7D%0Apre%20%2Ecomment%20%7B%0Acolor%3A%20%23998%3B%0Afont%2Dstyle%3A%20italic%0A%7D%0Apre%20%2Ekeyword%20%7B%0Acolor%3A%20%23900%3B%0Afont%2Dweight%3A%20bold%0A%7D%0Apre%20%2Eidentifier%20%7B%0Acolor%3A%20rgb%280%2C%200%2C%200%29%3B%0A%7D%0Apre%20%2Estri [...]
+<script src="data:application/x-javascript;base64,dmFyIGhsanM9bmV3IGZ1bmN0aW9uKCl7ZnVuY3Rpb24gbShwKXtyZXR1cm4gcC5yZXBsYWNlKC8mL2dtLCImYW1wOyIpLnJlcGxhY2UoLzwvZ20sIiZsdDsiKX1mdW5jdGlvbiBmKHIscSxwKXtyZXR1cm4gUmVnRXhwKHEsIm0iKyhyLmNJPyJpIjoiIikrKHA/ImciOiIiKSl9ZnVuY3Rpb24gYihyKXtmb3IodmFyIHA9MDtwPHIuY2hpbGROb2Rlcy5sZW5ndGg7cCsrKXt2YXIgcT1yLmNoaWxkTm9kZXNbcF07aWYocS5ub2RlTmFtZT09IkNPREUiKXtyZXR1cm4gcX1pZighKHEubm9kZVR5cGU9PTMmJnEubm9kZVZhbHVlLm1hdGNoKC9ccysvKSkpe2JyZWFrfX19ZnVuY3Rpb24gaCh0LH [...]
+
+<style type="text/css">code{white-space: pre;}</style>
+<style type="text/css">
+ pre:not([class]) {
+ background-color: white;
+ }
+</style>
+<script type="text/javascript">
+if (window.hljs && document.readyState && document.readyState === "complete") {
+ window.setTimeout(function() {
+ hljs.initHighlighting();
+ }, 0);
+}
+</script>
+
+
+<link href="data:text/css;charset=utf-8,body%2C%20td%20%7B%0Afont%2Dfamily%3A%20sans%2Dserif%3B%0Abackground%2Dcolor%3A%20white%3B%0Afont%2Dsize%3A%2013px%3B%0A%7D%0Abody%20%7B%0Amax%2Dwidth%3A%20800px%3B%0Amargin%3A%200%20auto%3B%0Apadding%3A%201em%201em%202em%3B%0Aline%2Dheight%3A%2020px%3B%0A%7D%0A%0Adiv%23TOC%20li%20%7B%0Alist%2Dstyle%3Anone%3B%0Abackground%2Dimage%3Anone%3B%0Abackground%2Drepeat%3Anone%3B%0Abackground%2Dposition%3A0%3B%0A%7D%0A%0Ap%2C%20pre%20%7B%20margin%3A%200em%2 [...]
+
+<script type="text/javascript">
+document.addEventListener("DOMContentLoaded", function() {
+ var links = document.links;
+ for (var i = 0, linksLength = links.length; i < linksLength; i++)
+ if(links[i].hostname != window.location.hostname)
+ links[i].target = '_blank';
+});
+</script>
+
+</head>
+
+<body>
+
+
+<div id="header">
+<h1 class="title">Vignette for phyloseq: Analysis of high-throughput microbiome census data</h1>
+</div>
+
+<h1>Contents</h1>
+<div id="TOC">
+<ul>
+<li><a href="#other-resources"><span class="toc-section-number">1</span> Other resources</a></li>
+<li><a href="#summary"><span class="toc-section-number">2</span> Summary</a></li>
+<li><a href="#about-this-vignette"><span class="toc-section-number">3</span> About this vignette</a></li>
+<li><a href="#data"><span class="toc-section-number">4</span> Data</a><ul>
+<li><a href="#interface-with-the-microbio.meqiime-server"><span class="toc-section-number">4.1</span> Interface with the microbio.me/qiime server</a></li>
+<li><a href="#included-data"><span class="toc-section-number">4.2</span> Included Data</a></li>
+</ul></li>
+<li><a href="#simple-exploratory-graphics"><span class="toc-section-number">5</span> Simple exploratory graphics</a><ul>
+<li><a href="#easy-richness-estimates"><span class="toc-section-number">5.1</span> Easy Richness Estimates</a></li>
+<li><a href="#exploratory-tree-plots"><span class="toc-section-number">5.2</span> Exploratory tree plots</a></li>
+<li><a href="#exploratory-bar-plots"><span class="toc-section-number">5.3</span> Exploratory bar plots</a></li>
+</ul></li>
+<li><a href="#exploratory-analysis-and-graphics"><span class="toc-section-number">6</span> Exploratory analysis and graphics</a><ul>
+<li><a href="#exploratory-heat-map"><span class="toc-section-number">6.1</span> Exploratory Heat Map</a></li>
+<li><a href="#microbiome-network-representation"><span class="toc-section-number">6.2</span> Microbiome Network Representation</a></li>
+<li><a href="#ordination-methods"><span class="toc-section-number">6.3</span> Ordination Methods</a></li>
+<li><a href="#distance-methods"><span class="toc-section-number">6.4</span> Distance Methods</a></li>
+<li><a href="#hierarchical-clustering"><span class="toc-section-number">6.5</span> Hierarchical Clustering</a></li>
+</ul></li>
+<li><a href="#multiple-testing-and-differential-abundance"><span class="toc-section-number">7</span> Multiple Testing and Differential Abundance</a></li>
+</ul>
+</div>
+
+<!--
+%% \VignetteEngine{knitr::rmarkdown}
+%% \VignetteIndexEntry{analysis vignette}
+-->
+<p>Paul J. McMurdie and Susan Holmes</p>
+<p><a href="mailto:mcmurdie at stanford.edu">mcmurdie at stanford.edu</a></p>
+<p><a href="http://joey711.github.io/phyloseq/">phyloseq Home Page</a></p>
+<p>If you find phyloseq and/or its tutorials useful, please acknowledge and cite phyloseq in your publications:</p>
+<p><strong>phyloseq: An R package for reproducible interactive analysis and graphics of microbiome census data</strong> (2013) PLoS ONE 8(4):e61217 <a href="http://dx.plos.org/10.1371/journal.pone.0061217" class="uri">http://dx.plos.org/10.1371/journal.pone.0061217</a></p>
+<div id="other-resources" class="section level1">
+<h1><span class="header-section-number">1</span> Other resources</h1>
+<p>The phyloseq project also has a number of supporting online resources, most of which can be found at <a href="http://joey711.github.com/phyloseq/">the phyloseq home page</a>, or from the phyloseq stable release <a href="http://bioconductor.org/packages/release/bioc/html/phyloseq.html">page on Bioconductor</a>.</p>
+<p>To post feature requests or ask for help, try <a href="https://github.com/joey711/phyloseq/issues">the phyloseq Issue Tracker</a>.</p>
+</div>
+<div id="summary" class="section level1">
+<h1><span class="header-section-number">2</span> Summary</h1>
+<p>The analysis of microbiological communities brings many challenges: the integration of many different types of data with methods from ecology, genetics, phylogenetics, network analysis, visualization and testing. The data itself may originate from widely different sources, such as the microbiomes of humans, soils, surface and ocean waters, wastewater treatment plants, industrial facilities, and so on; and as a result, these varied sample types may have very different forms and scales [...]
+</div>
+<div id="about-this-vignette" class="section level1">
+<h1><span class="header-section-number">3</span> About this vignette</h1>
+<p>A separate vignette is included within the phyloseq-package that describes the basics of importing pre-clustered phylogenetic sequencing data, data filtering, as well as some transformations and some additional details about the package and installation. A quick way to load it is:</p>
+<pre class="r"><code>vignette("phyloseq-basics")</code></pre>
+<p>By contrast, this vignette is intended to provide functional examples of the analysis tools and wrappers included in phyloseq. All necessary code for performing the analysis and producing graphics will be included with its description, and the focus will be on the use of example data that is included and documented within the phyloseq-package.</p>
+<p>Let’s start by loading the <code>phyloseq</code> package:</p>
+<pre class="r"><code>library("phyloseq")
+library("ggplot2")</code></pre>
+<p>And because we will show examples of custom modifications to ggplot2 plots, we also loaded ggplot2. Here I’ll set as default my favorite ggplot2 theme. These are completely optional, and modifiable.</p>
+<pre class="r"><code>theme_set(theme_bw())</code></pre>
+</div>
+<div id="data" class="section level1">
+<h1><span class="header-section-number">4</span> Data</h1>
+<div id="interface-with-the-microbio.meqiime-server" class="section level2">
+<h2><span class="header-section-number">4.1</span> Interface with the microbio.me/qiime server</h2>
+<p>See the <a href="http://joey711.github.io/phyloseq/download-microbio.me.html">microbio_me_qiime tutorial</a> for more details and examples downloading and importing into phyloseq/R directly from this public database.</p>
+</div>
+<div id="included-data" class="section level2">
+<h2><span class="header-section-number">4.2</span> Included Data</h2>
+<p>To facilitate testing and exploration of tools in phyloseq, this package includes example data from published studies. Many of the examples in this vignette use either the <a href="http://www.pnas.org/content/early/2010/06/02/1000080107">Global Patterns</a> or <code>enterotype</code> datasets as source data. The <a href="http://www.pnas.org/content/early/2010/06/02/1000080107">Global Patterns</a> data was described in a <a href="http://www.pnas.org/content/early/2010/06/02/1000080107" [...]
+<p>Because this data is included in the package, the examples can easily be run on your own computer using the code shown in this vignette. The data is loaded into memory using the <code>data</code> command. Let’s start by loading the <a href="http://www.pnas.org/content/early/2010/06/02/1000080107">Global Patterns</a> data.</p>
+<pre class="r"><code>data(GlobalPatterns)</code></pre>
+<p>Later on we will use an additional categorical designation — human versus non-human associated samples — that was not in the original dataset. Now is a good time to add it as an explicit variable of the <code>sample_data</code>, and because we don’t want to type long words over and over, we’ll choose a shorter name for this modified version of <a href="http://www.pnas.org/content/early/2010/06/02/1000080107">Global Patterns</a>, call it <code>GP</code>, and also remove a handful of ta [...]
+<pre class="r"><code># prune OTUs that are not present in at least one sample
+GP <- prune_taxa(taxa_sums(GlobalPatterns) > 0, GlobalPatterns)
+# Define a human-associated versus non-human categorical variable:
+human <- get_variable(GP, "SampleType") %in% c("Feces", "Mock", "Skin", "Tongue")
+# Add new human variable to sample data:
+sample_data(GP)$human <- factor(human)</code></pre>
+</div>
+</div>
+<div id="simple-exploratory-graphics" class="section level1">
+<h1><span class="header-section-number">5</span> Simple exploratory graphics</h1>
+<div id="easy-richness-estimates" class="section level2">
+<h2><span class="header-section-number">5.1</span> Easy Richness Estimates</h2>
+<p>For further details, see the <a href="http://joey711.github.io/phyloseq/plot_richness-examples.html">plot_richness tutorial</a></p>
+<p>We can easily create a complex graphic that compares the richness estimates of samples from different environment types in the <a href="http://www.pnas.org/content/early/2010/06/02/1000080107">Global Patterns</a> dataset, using the <code>plot_richness</code> function. Note that it is important to use raw (untrimmed) OTU-clustered data when performing richness estimates, as they can be highly dependent on the number of singletons in a sample.</p>
+<pre class="r"><code>alpha_meas = c("Observed", "Chao1", "ACE", "Shannon", "Simpson", "InvSimpson")
+(p <- plot_richness(GP, "human", "SampleType", measures=alpha_meas))</code></pre>
+<p><img src="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAABOAAAAKgCAIAAAC9UZVxAAAACXBIWXMAAA7DAAAOwwHHb6hkAAAgAElEQVR4nOzdZ3wU5frw8Wu2ZzedhNAJndAhIEVEFFBEbAhWxAaiYENBPaLPOYIIihxQESt/BAuichTFgoBHFGyR0LsQpAZI2PTsZss8L5YTQ2gJbmZ2k9/3w4uZa2f3vrLM7sy1c899K6qqCgAAAAAAejPonQAAAAAAACIUqAAAAACAEEGBCgAAAAAICRSoAAAAAICQQIEKAAAAAAgJFKgAAAAAgJBg0juBv2vatGmZmZl6Z4FzsNlszzzzjNVq1aX1GTNm7N+/X5emUXE2m23ixIlRUVG6tP7yyy/v2bNHl6ZRcTab7bHHHouPjz+/p7/++uvbt28PbkoIOpvNNm7cuKSkpKp48bfffnvz5s1V8coIIpvNNnbs2IYNG [...]
+<p>Add a ggplot2 box plot layer to the previous plot</p>
+<pre class="r"><code>p + geom_boxplot(data=p$data, aes(x=human, y=value, color=NULL), alpha=0.1)</code></pre>
+<p><img src="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAABOAAAAKgCAIAAAC9UZVxAAAACXBIWXMAAA7DAAAOwwHHb6hkAAAgAElEQVR4nOzdeXwT1fo/8GdmsjVputEFylYoW9ltQSgiooAgoCKrIqAiuICoKIiX5XeViqDoBRFRUL/IIgiKiuJVZFFWkaWUfael0IU2bbqkWZrMzO+PcGtpC6RNOpM0n/fLl6+Zk8k5T8JpMk/mzDmMKIoEAAAAAAAAIDdW7gAAAAAAAAAAiJCgAgAAAAAAgJdAggoAAAAAAABeAQkqAAAAAAAAeAUkqAAAAAAAAOAVkKACAAAAAACAV1DIHYC7FixYkJ2dLXcUcAcajebtt99Wq9WytP7hhx9evXpVlqbBdRqNZtasWXq9XpbWlyxZcvnyZVmaBtdpNJo33ngjLCysZk//7LPPzp4969mQwOM0Gs3UqVOjoqJqo/Ivv [...]
+</div>
+<div id="exploratory-tree-plots" class="section level2">
+<h2><span class="header-section-number">5.2</span> Exploratory tree plots</h2>
+<p>For further details, see the <a href="http://joey711.github.io/phyloseq/plot_tree-examples.html">plot_tree tutorial</a></p>
+<p>phyloseq also contains a method for easily plotting an annotated phylogenetic tree with information regarding the sample in which a particular taxa was observed, and optionally the number of individuals that were observed.</p>
+<p>For the sake of creating a readable tree, let’s subset the data to just the <a href="http://en.wikipedia.org/wiki/Chlamydiae">Chlamydiae</a> phylum, which consists of obligate intracellular pathogens and is present in only a subset of environments in this dataset.</p>
+<pre class="r"><code>GP.chl <- subset_taxa(GP, Phylum=="Chlamydiae")</code></pre>
+<p>And now we will create the tree graphic from this subset of <a href="http://www.pnas.org/content/early/2010/06/02/1000080107">Global Patterns</a>, shading by the <code>SampleType</code> variable, which indicates the environment category from which the microbiome samples originated. The following command also takes the option of labeling the number of individuals observed in each sample (if at all) of each taxon. The symbols are slightly enlarged as the number of individuals increases.</p>
+<pre class="r"><code>plot_tree(GP.chl, color="SampleType", shape="Family", label.tips="Genus", size="Abundance")</code></pre>
+<p><img src="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAABaAAAAKgCAIAAADiZU0RAAAACXBIWXMAAA7DAAAOwwHHb6hkAAAgAElEQVR4nOzde1yMef8/8M/URMfpfFRKUlOiNiId6eCOEAnt6i4rdrFuNi0ioRw2i7D3olhK60s5JseiRQdSUSzVndJKKlR0Tqf5/XF9v/ObO82YUk1Tr+fj/mOuz7yvz+d9Xdfeduftc30+NBaLRQAAAAAAAAAAhJmIoBMAAAAAAAAAAPhSKHAAAAAAAAAAgNBDgQMAAAAAAAAAhB4KHAAAAAAAAAAg9FDgAAAAAAAAAAChhwIHAAAAAAAAAAg9Ou+vm5ubN2/e3DepAAAAAAAAABBCvvvuO11dXUFnAULmMwWOtrY2QkhISEifJAMAAAAAAACDWkFBgYeHx/z58wWdCAgfvKICAAAAAAAAAEIPBQ4AAAAAAAAAEHooc [...]
+</div>
+<div id="exploratory-bar-plots" class="section level2">
+<h2><span class="header-section-number">5.3</span> Exploratory bar plots</h2>
+<p>For further details, see the <a href="http://joey711.github.io/phyloseq/plot_bar-examples.html">plot_bar tutorial</a></p>
+<p>In the following example we use the included “enterotype” dataset (<a href="http://www.nature.com/nature/journal/v473/n7346/full/nature09944.html">Arumugam 2011</a>).</p>
+<pre class="r"><code>data(enterotype)</code></pre>
+<p>We start with a simple rank-abundance barplot, using the cumulative fractional abundance of each OTU in the dataset. In the enterotype dataset, the available published data are simplified as sample-wise fractional occurrences, rather than counts of individuals\footnote{Unfortunate, as this means we lose information about the total number of reads and associated confidences, ability to do more sophisticated richness estimates, etc. For example, knowing that we observed 1 sequence read [...]
+<pre class="r"><code>par(mar = c(10, 4, 4, 2) + 0.1) # make more room on bottom margin
+N <- 30
+barplot(sort(taxa_sums(enterotype), TRUE)[1:N]/nsamples(enterotype), las=2)</code></pre>
+<p><img src="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAwAAAAJACAMAAAANcPFkAAADAFBMVEUAAAABAQECAgIDAwMEBAQFBQUGBgYHBwcICAgJCQkKCgoLCwsMDAwNDQ0ODg4PDw8QEBARERESEhITExMUFBQVFRUWFhYXFxcYGBgZGRkaGhobGxscHBwdHR0eHh4fHx8gICAhISEiIiIjIyMkJCQlJSUmJiYnJycoKCgpKSkqKiorKyssLCwtLS0uLi4vLy8wMDAxMTEyMjIzMzM0NDQ1NTU2NjY3Nzc4ODg5OTk6Ojo7Ozs8PDw9PT0+Pj4/Pz9AQEBBQUFCQkJDQ0NERERFRUVGRkZHR0dISEhJSUlKSkpLS0tMTExNTU1OTk5PT09QUFBRUVFSUlJTU1NUVFRVVVVWVlZXV1dYWFhZWVlaWlpbW1tcXFxdXV1eXl5fX19gYGBhYWFiYmJjY2NkZ [...]
+<p>Note that this first barplot is clipped at the 30th OTU. This was chosen because <code>ntaxa(enterotype)</code> = 553 OTUs would not be legible on the plot. As you can see, the relative abundances have decreased dramatically by the 10th-ranked OTU.</p>
+<p>So what are these OTUs? In the <code>enterotype</code> dataset, only a single taxonomic rank type is present:</p>
+<pre class="r"><code>rank_names(enterotype)</code></pre>
+<pre><code>## [1] "Genus"</code></pre>
+<p>This means the OTUs in this dataset have been grouped at the level of genera, and no other taxonomic grouping/transformation is possible without additional information (like might be present in a phylogenetic tree, or with further taxonomic classification analysis).</p>
+<p>We need to know which taxonomic rank classifiers, if any, we have available to specify in the second barplot function in this example, <code>plot_bar()</code>. We have already observed how quickly the abundance decreases with rank, so we will subset the enterotype dataset to the most abundant <code>N</code> taxa in order to make the barplot legible on this page.</p>
+<pre class="r"><code>TopNOTUs <- names(sort(taxa_sums(enterotype), TRUE)[1:10])
+ent10 <- prune_taxa(TopNOTUs, enterotype)
+print(ent10)</code></pre>
+<pre><code>## phyloseq-class experiment-level object
+## otu_table() OTU Table: [ 10 taxa and 280 samples ]
+## sample_data() Sample Data: [ 280 samples by 9 sample variables ]
+## tax_table() Taxonomy Table: [ 10 taxa by 1 taxonomic ranks ]</code></pre>
+<p>Note also that there are 280 samples in this dataset, and so a remaining challenge is to consolidate these samples into meaningful groups. A good place to look is the available sample variables, which in most cases will carry more “meaning” than the sample names alone.</p>
+<pre class="r"><code>sample_variables(ent10)</code></pre>
+<pre><code>## [1] "Enterotype" "Sample_ID" "SeqTech" "SampleID"
+## [5] "Project" "Nationality" "Gender" "Age"
+## [9] "ClinicalStatus"</code></pre>
+<p>The parameters to <code>plot_bar</code> in the following code-chunk were chosen after various trials. We suggest that you also try different parameter settings while you’re exploring different features of the data. In addition to the variables names of <code>sample_data</code>, the <code>plot_bar()</code> function recognizes the names of taxonomic ranks (if present). See the help documentation and further details in the examples and on the wiki page. In this example we have also elect [...]
+<pre class="r"><code>plot_bar(ent10, "SeqTech", fill="Enterotype", facet_grid=~Genus)</code></pre>
+<p><img src="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAA8AAAAJACAIAAADNc5igAAAACXBIWXMAAA7DAAAOwwHHb6hkAAAgAElEQVR4nOzdd3wUdf4/8M/WbHbTCwlphFBC6AqIBZDqoYAF5fQU7CLnnYV24CEqBwLSRDk9RfkqCKKiCNaTIk2RkgQQUsBAQgrpySbby+z8/pjv7Te/7O7s7Mxsyd7r+XjweITNfD7znpn3zr4z+5nPSGiaJgAAAAAAwI002AEAAAAAAHQlKKABAAAAAHyAAhoAAAAAwAcooAEAAAAAfIACGgAAAADAByigAQAAAAB8IA92AP9n9erVdXV1wY7CX2JjY5ctW9bpxaNHj+7evTso8QTG7Nmz+/fv3/GV9vb2l19+OVjxBMANN9zw4IMPdnrxzTffLC8vD0o8AaBSqV577TWZTNbxxfz8/O3btwcrpACYNWvWsGHDOr5CU [...]
+<p>An example exploratory bar plot using the <code>plot_bar</code> function. In this case we have faceted the data (abundance values) according to the genera of each OTU. The subset of OTUs that have not been assigned to a specific genus are in the <code>NA</code> panel. Within each facet, the data is further separated by sequencing technology, and each OTU is shaded according to the enterotype of the sample it form which it came. Abundance values from different samples and OTUs but havi [...]
+<p>Figure summarizes quantitatively the increased abundances of Bacteroides and Prevotella in the Enterotypes 1 and 2, respectively. Interestingly, a large relative abundance of Blautia was observed for Enterotype 3, but only from 454-pyrosequencing data sets, not the Illumina or Sanger datasets. This suggests the increased Blautia might actually be an artifact. Similarly, Prevotella appears to be one of the most abundant genera in the Illumina-sequenced samples among Enterotype 3, but t [...]
+</div>
+</div>
+<div id="exploratory-analysis-and-graphics" class="section level1">
+<h1><span class="header-section-number">6</span> Exploratory analysis and graphics</h1>
+<div id="exploratory-heat-map" class="section level2">
+<h2><span class="header-section-number">6.1</span> Exploratory Heat Map</h2>
+<p>For further details, see the <a href="http://joey711.github.io/phyloseq/plot_heatmap-examples.html">plot_heatmap tutorial</a></p>
+<p>As the number of taxa in a dataset gets very large, the ability to effectively display all of the elements of the data becomes compromised, and a heatmap representation is no exception. It can also be time-consuming to render. To address both these issues, we show an example in which we have subsetted the Global Patterns dataset to a manageable portion, in this case, the Crenarchaeota phylum.</p>
+<pre class="r"><code>data("GlobalPatterns")
+gpac <- subset_taxa(GlobalPatterns, Phylum=="Crenarchaeota")
+(p <- plot_heatmap(gpac, "NMDS", "bray", "SampleType", "Family"))</code></pre>
+<p><img src="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAA2AAAAKgCAIAAAByb4nzAAAACXBIWXMAAA7DAAAOwwHHb6hkAAAgAElEQVR4nOzde3wU9b0//s/sNdnsZnfdbAIETIQUjAlNw0WNKEFuEiEgMUdKtWKtKO05dquo7cNLqx5b2mNri1YPP/hWjdVikCYI8YQCCogY1HCxROMNCCQhCWTJJpvNZq/z+2PMNJLIJXnPzu7O6/nwj09mZ995Z8zlzczn/flwPM8zAAAAAIA+KrkTAAAAAIDoggIRAAAAAL4BBSIAAAAAfAMKRAAAAAD4BhSIAAAAAPANKBABAAAA4Bs0cicgv2efffbkyZMXcibP86FQSK1Wcxw3/M8bCoUYY2q1evihaBMLBoMcx5EkFg6Hw+GwRkPzbRYMBlUqlUpF8K8aJHaxyBMj/DnieZ4kMYX8gEdtYvjNc7GQ2KDuu+++t [...]
+<p>What if you wanted to change the axis labels?</p>
+<pre class="r"><code>p$scales$scales[[1]]$name <- "My X-Axis"
+p$scales$scales[[2]]$name <- "My Y-Axis"
+print(p)</code></pre>
+<p><img src="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAA2AAAAKgCAIAAAByb4nzAAAACXBIWXMAAA7DAAAOwwHHb6hkAAAgAElEQVR4nOzdfVxUdd4//s+ZWxjm1pkBlQxUVjIwLlDbJU3MGxIVDeKbubXaVZnWte7srtW1V9pu7a/W9jJ319paV68s2goxE29ocdVSMsUS7xLDvAUBAWVgYBiGGWbm/P44MUuKd/A5c2bmvJ6P/vhw5sybNydu3p7zeX8+DMuyBAAAAACgm0ToBAAAAAAguKBABAAAAIAfQIEIAAAAAD+AAhEAAAAAfgAFIgAAAAD8AApEAAAAAPgBmdAJCO+NN964ePHizZzJsqzX65VKpQzD9P/zer1eQohUKu1/KLqJeTwehmGoJObz+Xw+n0xG59vM4/FIJBKJhMK/apDYraKeGMWfI5ZlqSQmkh/woE0Mv3luFRLr1a9+9auYm [...]
+<p>Note that it is possible to order the sample/species indices by any of the ordination methods supported in the <code>ordinate</code> function; and also that the color scheme can be modified with additional arguments.</p>
+<p>Heat map representation of the Crenarchaeota phylum abundance pattern across different sample types in the Global Patterns dataset.</p>
+</div>
+<div id="microbiome-network-representation" class="section level2">
+<h2><span class="header-section-number">6.2</span> Microbiome Network Representation</h2>
+<p>For further details, see the <a href="http://joey711.github.io/phyloseq/plot_network-examples.html">plot_network tutorial</a></p>
+<p>Continuing with the <code>enterotype</code> dataset, here are some examples for creating a custom network representation of the relationship between microbiome samples in an experiment. This relies heavily on the igraph and ggplot2 packages to create a network display of the “connectedness” of samples according to some user-provided ecological similarity. By default, points represent microbiom samples, and are determined using an algorithm that optimizes the clarity of the display of [...]
+<p>In this example, the default dissimilarity index was used (Jaccard, co-occurrence), with a maximum distance of <code>0.3</code> required to create an edge. Any function that can operate on phyloseq-objects and return a sample-wise distance can be provided as the <code>dist.fun</code> argument, or a character string of the name of the distance function already supported in phyloseq. Other distances may result in very different clustering, and this is a choice that should be understood [...]
+<p>Interestingly, at this level of analysis and parameter-settings the two major sub-graphs appear to be best explained by the sequencing technology and not the subject enterotype, suggesting that the choice of sequencing technology has a major effect on the microbial community one can observe. This seems to differ somewhat with the inferences described in the “enterotype” article (<a href="http://www.nature.com/nature/journal/v473/n7346/full/nature09944.html">Arumugam 2011</a>). However [...]
+<pre class="r"><code>data(enterotype)
+plot_net(enterotype, maxdist=0.4, color="SeqTech", shape="Enterotype")</code></pre>
+<p><img src="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAABCAAAAKgCAIAAADF7pvQAAAACXBIWXMAAA7DAAAOwwHHb6hkAAAgAElEQVR4nOzdeUATZ/4/8GdCIBBOUQS5BUFB8cKLw4qAioIHora2ntVurd0edrvbdtt+7c92u7W3bbdqvVutVltBRRFQQbkURUQhcl9y35CEhIRkfn/MLssiQoiBgfB+/VOYzDP5xCLOe56LommaAAAAAAAAaAKH7QIAAAAAAEB7IGAAAAAAAIDGIGAAAAAAAIDGIGAAAAAAAIDGIGAAAAAAAIDGIGAAAAAAAIDGIGAAAAAAAIDGIGAAAAAAAIDGIGAAAAAAAIDGIGAAAAAAAIDGIGAAAAAAAIDGIGAAAAAAAIDGIGAAAAAAAIDGIGAAAAAAAIDGIGAAAAAAAIDGIGAAAAAAAIDGIGAAAAAAAIDGIGAAAAAAAIDGIGAAA [...]
+<p>Network representation of the relationship between microbiome samples in the “Enterotype” dataset (<a href="http://www.nature.com/nature/journal/v473/n7346/full/nature09944.html">Arumugam 2011</a>).</p>
+</div>
+<div id="ordination-methods" class="section level2">
+<h2><span class="header-section-number">6.3</span> Ordination Methods</h2>
+<p>For further details, see the <a href="http://joey711.github.io/phyloseq/plot_ordination-examples.html">plot_ordination tutorial</a></p>
+<p>Ordination methods can be a useful tool for exploring complex phylogenetic sequencing data, particularly when the hypothesized structure of the data is poorly defined (or there isn’t a hypothesis). The phyloseq package provides some useful tools for performing ordinations and plotting their results, via the <code>ordinate() and</code>plot_ordination() functions, respectively. Although there are many options and methods supported, a first-step will probably look something like the foll [...]
+<pre class="r"><code>my.physeq <- import("Biom", BIOMfilename="myBiomFile.biom")
+my.ord <- ordinate(my.physeq)
+plot_ordination(my.physeq, my.ord, color="myFavoriteVarible")</code></pre>
+<p>It is probably a good idea to read the documentation for these two functions, as they also provide links to related functions and additional examples you can try immediately on your own machine.</p>
+<pre class="r"><code>help(import)
+help(ordinate)
+help(distance)
+help(plot_ordination)</code></pre>
+<div id="principal-coordinates-analysis-pcoa" class="section level3">
+<h3><span class="header-section-number">6.3.1</span> Principal Coordinates Analysis (PCoA)</h3>
+<p>We take as our first example, a reproduction of Figure 5 from the “Global Patterns” article(<a href="http://www.pnas.org/content/early/2010/06/02/1000080107">Caporaso 2011</a>). The authors show a 3-dimensional representation of the first three axes of a Principal Coordinates Analysis (PCoA; This is also sometimes referred to as “Multi-Dimensional Scaling”, or “MDS”) performed on the unweighted-UniFrac distance using all of the available sequences (their approach included both 5’ and [...]
+<p>The following reproduces the unweighted UniFrac distance calculation on the full dataset. Note that this calculation can take a long time because of the large number of OTUs. Parallelization is recommended for large datasets, typically if they are as large as <a href="http://www.pnas.org/content/early/2010/06/02/1000080107">Global Patterns</a>, or larger. For details on parallelization, see the details section and examples in the <code>UniFrac()</code> documentation, and also the page [...]
+<p><a href="http://joey711.github.io/phyloseq-demo/unifrac.html" class="uri">http://joey711.github.io/phyloseq-demo/unifrac.html</a></p>
+<pre class="r"><code>data(GlobalPatterns)</code></pre>
+<pre class="r"><code>GPUF <- UniFrac(GlobalPatterns)</code></pre>
+<p>Load the pre-computed distance matrix, <code>GPUF</code></p>
+<pre class="r"><code>load(system.file("doc", "Unweighted_UniFrac.RData", package="phyloseq"))</code></pre>
+<p>Calculate the PCoA on this distance matrix, <code>GPUF</code>.</p>
+<pre class="r"><code>GloPa.pcoa = ordinate(GlobalPatterns, method="PCoA", distance=GPUF)</code></pre>
+<p>Before we look at the results, let’s first investigate how much of the total distance structure we will capture in the first few axes. We can do this graphically with a “scree plot”, an ordered barplot of the relative fraction of the total eigenvalues associated with each axis.</p>
+<pre class="r"><code>plot_scree(GloPa.pcoa, "Scree plot for Global Patterns, UniFrac/PCoA")</code></pre>
+<p><img src="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAkAAAAGACAMAAAByRC0tAAAC+lBMVEUAAAABAQECAgIDAwMEBAQFBQUGBgYHBwcICAgJCQkKCgoLCwsMDAwNDQ0ODg4PDw8QEBARERESEhITExMUFBQVFRUWFhYXFxcYGBgZGRkaGhobGxscHBwdHR0eHh4fHx8gICAhISEiIiIjIyMkJCQlJSUnJycoKCgpKSkqKiorKyssLCwtLS0uLi4vLy8wMDAxMTEyMjIzMzM0NDQ1NTU2NjY3Nzc4ODg5OTk6Ojo7Ozs8PDw9PT0+Pj4/Pz9AQEBBQUFCQkJDQ0NERERFRUVGRkZHR0dISEhJSUlKSkpLS0tMTExNTU1OTk5PT09QUFBRUVFSUlJTU1NUVFRVVVVWVlZXV1dYWFhZWVlaWlpbW1tcXFxdXV1eXl5fX19gYGBhYWFiYmJjY2NkZGRlZ [...]
+<p>Scree plot of the PCoA used to create Figure 5 from the “Global Patterns” article (<a href="http://www.pnas.org/content/early/2010/06/02/1000080107">Caporaso 2011</a>). The first three axes represent 43% of the total variation in the distances. Interestingly, the fourth axis represents another 9%, and so may warrant exploration as well. A scree plot is an important tool for any ordination method, as the relative importance of axes can vary widely from one dataset to another.</p>
+<p>Next, we will reproduce Figure 5 from the “Global Patterns” article (<a href="http://www.pnas.org/content/early/2010/06/02/1000080107">Caporaso 2011</a>), but separating the three axes into 2 plots using <code>plot_ordination()</code>.</p>
+<pre class="r"><code>(p12 <- plot_ordination(GlobalPatterns, GloPa.pcoa, "samples", color="SampleType") +
+ geom_point(size=5) + geom_path() + scale_colour_hue(guide = FALSE) )</code></pre>
+<p><img src="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAA2AAAAKgCAIAAAByb4nzAAAACXBIWXMAAA7DAAAOwwHHb6hkAAAgAElEQVR4nOzdd3xT5eLH8eckadK06W5auqBllVGGlLJRXIgKiANwcpWr4ARRERX14lVUFAf36pXhwg0uRP1xFVAU2WUXWnYn3btN0zTJ+f1RLxLoSEpGx+f9Vzjnycn3FZr02zOeI8myLAAAAID/UXg6AAAAAFoXCiIAAABsUBABAABgg4IIAAAAGxREAAAA2KAgAgAAwIbK0wEc8K9//ev06dP1j81ms0KhUCgouC1ktVolSZIkydNB2iqr1Wq1WlWqtvQJam0sFotSqfR0ijaMr8ELxNfgBbJYLEIIPsUXopV8DV5//fVDhw49Z2Fb+vV24sSJJUuW1D8uLi728fHRarWejdR2GY1GSZI0Go2ng7RVBoPBaDQGBwd7O [...]
+<pre class="r"><code>(p13 <- plot_ordination(GlobalPatterns, GloPa.pcoa, "samples", axes=c(1, 3),
+ color="SampleType") + geom_line() + geom_point(size=5) )</code></pre>
+<p><img src="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAA2AAAAKgCAIAAAByb4nzAAAACXBIWXMAAA7DAAAOwwHHb6hkAAAgAElEQVR4nOzdd2BT5foH8Peck9E2oyuddFFklt2KiMheMhwoU8SFExRwe/nhFXBe9SqIIuqVKyoq4kaUjexZChTKSim00N2k2eOM3x/xIqO0SZvkZHw/f4WcJ+c8CTknT9/zDkoQBAIAAAAA8D+02AkAAAAAQGBBgQgAAAAAl0GBCAAAAACXQYEIAAAAAJdBgQgAAAAAl0GBCAAAAACXkYidgAcWLVp04cIFQgjLsjRN03T4VrccxzEMI3YWohEEwfUJUBQldi6iCfPvAM/zPM9LJMF0BfMuQRB4ng/n7wDHcYIghPN3gOd5iqL8dhl8+OGHs7Oz/XMsCATBdGpptdqFCxcSQnQ6nUwmUygUYmckGqPRqFQqw7Y84nm+r [...]
+</div>
+<div id="non-metric-multi-dimensional-scaling-nmds" class="section level3">
+<h3><span class="header-section-number">6.3.2</span> non-metric Multi-Dimensional Scaling (NMDS)</h3>
+<p>We repeat the previous example, but instead using non-metric multidimensional scaling (NMDS) limited to just two dimensions. This approach limits the amount of residual distance “not shown” in the first two (or three) axes, but forfeits some mathematical properties and does not always converge within the specified number of axes.</p>
+<pre class="r"><code># (Re)load UniFrac distance matrix and GlobalPatterns data
+data(GlobalPatterns)
+load(system.file("doc", "Unweighted_UniFrac.RData", package="phyloseq"))
+# perform NMDS, set to 2 axes
+GP.NMDS <- ordinate(GlobalPatterns, "NMDS", GPUF)</code></pre>
+<pre><code>## Run 0 stress 0.1432774
+## Run 1 stress 0.167021
+## Run 2 stress 0.3869407
+## Run 3 stress 0.1432774
+## ... Procrustes: rmse 4.380552e-05 max resid 0.0001434423
+## ... Similar to previous best
+## Run 4 stress 0.1432625
+## ... New best solution
+## ... Procrustes: rmse 0.00349315 max resid 0.01262722
+## Run 5 stress 0.167021
+## Run 6 stress 0.2426281
+## Run 7 stress 0.1856304
+## Run 8 stress 0.2480425
+## Run 9 stress 0.1432774
+## ... Procrustes: rmse 0.00348014 max resid 0.01258164
+## Run 10 stress 0.1432625
+## ... Procrustes: rmse 4.133989e-06 max resid 1.146816e-05
+## ... Similar to previous best
+## Run 11 stress 0.1432774
+## ... Procrustes: rmse 0.003484347 max resid 0.01259783
+## Run 12 stress 0.1432625
+## ... Procrustes: rmse 2.941188e-06 max resid 1.220155e-05
+## ... Similar to previous best
+## Run 13 stress 0.1432774
+## ... Procrustes: rmse 0.003526331 max resid 0.01273739
+## Run 14 stress 0.1432625
+## ... New best solution
+## ... Procrustes: rmse 1.53826e-06 max resid 4.839912e-06
+## ... Similar to previous best
+## Run 15 stress 0.1432625
+## ... Procrustes: rmse 1.985561e-06 max resid 5.926334e-06
+## ... Similar to previous best
+## Run 16 stress 0.1432774
+## ... Procrustes: rmse 0.003480274 max resid 0.01258406
+## Run 17 stress 0.1432625
+## ... Procrustes: rmse 1.590998e-06 max resid 3.967232e-06
+## ... Similar to previous best
+## Run 18 stress 0.167021
+## Run 19 stress 0.2156369
+## Run 20 stress 0.1432774
+## ... Procrustes: rmse 0.003480002 max resid 0.01258237
+## *** Solution reached</code></pre>
+<pre class="r"><code>(p <- plot_ordination(GlobalPatterns, GP.NMDS, "samples", color="SampleType") +
+ geom_line() + geom_point(size=5) )</code></pre>
+<p><img src="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAA2AAAAKgCAIAAAByb4nzAAAACXBIWXMAAA7DAAAOwwHHb6hkAAAgAElEQVR4nOzdd1wUd/4/8M/s7C5lqVLFQhVFVERIYiFiC0bBaIw9xmhiEtM0luQuXy+/RI0pF3PGaC5qqrFFjZ41BluMFQuIAiIKgmKlLltgy5TfH+txoAvswvZ9Pf/IY5l578wbpLwyM5/Ph+J5ngAAAAAA/JfA2g0AAAAAgG1BQAQAAACARhAQAQAAAKARBEQAAAAAaAQBEQAAAAAaQUAEAAAAgEaE1m6gBV9//fWdO3es3YVxeJ5nWZamaYqirN2L7dJ9iazdhe3iOI7jOKHQ1n9CrYjneY7j8F3UDJZleZ7Hd1EzOI4jhAgEuFbSJIZhKIqiafrVV1+NiIiwdjtgObb+i6OoqGjFihXW7sI4DMNIpVJvb2+RSGTtX [...]
+<p>The figure nicely shows the relative dissimilarities between microbial communities from different habitats. However, it fails to indicate what was different between the communities. For an ordination method that provides information on the taxa that explain differences between samples (or groups of samples), we use Correspondence Analysis.</p>
+</div>
+<div id="correspondence-analysis-ca" class="section level3">
+<h3><span class="header-section-number">6.3.3</span> Correspondence Analysis (CA)</h3>
+<p>In the following section we will continue our exploration of the “GlobalPatterns” dataset using various features of an ordination method called Correspondence Analysis. We give special emphasis to exploratory interpretations using the biplot, because it provides additional information that is not available from PCoA or NMDS.</p>
+<p>Let’s start by performing a Correspondence Analysis and investigating the scree plot. Both interestingly and challengingly, the scree plot suggests that the <a href="http://www.pnas.org/content/early/2010/06/02/1000080107">Global Patterns</a> abundance data is quite high-dimensional, with the first two CA axes accounting for not quite 17% of the total (chi-square) variability. Note the absence of a steep decline in eigenvalue fraction as axis number increases. Each additional axis rep [...]
+<p>First, let’s severely subset the number of species for the sake of run-time. (Note: this is for illustration purposes only; do not repeat unless you are very sure you have a good reason for doing this.)</p>
+<pre class="r"><code>data(GlobalPatterns)
+# Take a subset of the GP dataset, top 200 species
+topsp <- names(sort(taxa_sums(GlobalPatterns), TRUE)[1:200])
+GP <- prune_taxa(topsp, GlobalPatterns)
+# Subset further to top 5 phyla, among the top 200 OTUs.
+top5ph <- sort(tapply(taxa_sums(GP), tax_table(GP)[, "Phylum"], sum), decreasing=TRUE)[1:5]
+GP <- subset_taxa(GP, Phylum %in% names(top5ph))
+# Re-add human variable to sample data:
+sample_data(GP)$human <- factor(human)</code></pre>
+<p>Now perform the correspondence analysis.</p>
+<pre class="r"><code># Now perform an unconstrained correspondence analysis
+gpca <- ordinate(GP, "CCA")
+# Scree plot
+plot_scree(gpca, "Scree Plot for Global Patterns Correspondence Analysis")</code></pre>
+<p><img src="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAwAAAAHgCAMAAAAlhPoXAAAC91BMVEUAAAABAQECAgIDAwMEBAQFBQUGBgYHBwcICAgJCQkKCgoLCwsMDAwNDQ0ODg4PDw8QEBARERESEhITExMUFBQVFRUWFhYXFxcYGBgZGRkaGhobGxscHBwdHR0eHh4fHx8gICAhISEiIiIjIyMkJCQnJycoKCgpKSkqKiorKyssLCwtLS0uLi4vLy8wMDAxMTEyMjIzMzM0NDQ1NTU2NjY3Nzc4ODg5OTk6Ojo7Ozs8PDw9PT0+Pj4/Pz9AQEBBQUFCQkJDQ0NERERFRUVGRkZHR0dISEhJSUlKSkpLS0tMTExNTU1OTk5PT09QUFBRUVFSUlJTU1NUVFRVVVVWVlZXV1dYWFhZWVlaWlpbW1tcXFxdXV1eXl5fX19gYGBhYWFiYmJjY2NkZGRlZWVmZ [...]
+<p>Now let’s investigate how the samples behave on the first few CA axes.</p>
+<pre class="r"><code>(p12 <- plot_ordination(GP, gpca, "samples", color="SampleType") +
+ geom_line() + geom_point(size=5) )</code></pre>
+<p><img src="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAA2AAAAKgCAIAAAByb4nzAAAACXBIWXMAAA7DAAAOwwHHb6hkAAAgAElEQVR4nOzdeVxUZfs/8PvMAsPMwLCDiCwukOIKaIrkrqRiLomYWllulaalZU/ZpqZtVpqW2renfqZpSpbmFua+IGmCKGooCIqo7DD7es7vj+khRweYgWGWw+f9R6/hzHXOuW6J4cNZ7kMxDEMAAAAAAP6H4+gGAAAAAMC5ICACAAAAgAkERAAAAAAwgYAIAAAAACYQEAEAAADABAIiAAAAAJjgOboBE19++eWdO3csLNbr9RwOh8NhQ8Y1GAxcLtfRXdiAwWBgGIbHc67/r5qGNd8UhmEMBgM7vikMwzAMw46fenyCOSGDwUAIYc1YbDuQ2bNnt2/f3oYbBCfnXL8zCgoK1qxZY2FxVVWVQCAQCoUt2pJ9SKVSLy8vR [...]
+<pre class="r"><code>(p34 <- plot_ordination(GP, gpca, "samples", axes=c(3, 4), color="SampleType") +
+ geom_line() + geom_point(size=5) )</code></pre>
+<p><img src="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAA2AAAAKgCAIAAAByb4nzAAAACXBIWXMAAA7DAAAOwwHHb6hkAAAgAElEQVR4nOzdeVxU9f4/8PeZYWaAGfZVdlwg9w0LjdyVVMwyFbcsSyuzsrLlW/3qpqbVza55s6t263otc82upWWYS6khroiiRoKyiCA7szDrOef3xxgBDjADwwwz83o++mM453M+n/dhTscXZ2V4nicAAAAAgD8J7F0AAAAAAHQuCIgAAAAA0AgCIgAAAAA0goAIAAAAAI0gIAIAAABAIwiIAAAAANCIm70LaOSf//znzZs37V2FdXAcx3Gcm1vn+g13EJ7neZ4XCFzi7w2WZXmed5FvluM4hmEYhrF3IbZgMBgEAoHrbMZCodDeVdgCz/PGlXWRzbjjvtknn3yya9euHdEzdE6d6x+5vLy8tWvX2rsK69BoNEqlMiAgw [...]
+<p>A clear feature of these plots is that the feces and mock communities cluster tightly together, far away from all other samples on the first axis (CA1). The skin and tongue samples separate similarly, but on the second axis. Taken together, it appears that the first two axes are best explained by the separation of human-associated “environments” from the other non-human environments in the dataset, with a secondary separation of tongue and skin samples from feces.</p>
+<p>We will now investigate further this top-level structure of the data, using an additional feature of correspondence analysis that allows us to compare the relative contributions of individual taxa on the same graphical space: the “biplot”. However, because we just displayed the position of samples in the ordination and there are often many thousands of OTUs, we will focus on creating an interpretable plot of the OTUs. For creating graphics that combine the two plots, try the <code>&qu [...]
+<pre class="r"><code>p1 <- plot_ordination(GP, gpca, "species", color="Phylum")
+(p1 <- ggplot(p1$data, p1$mapping) + geom_point(size=5, alpha=0.5) +
+ facet_wrap(~Phylum) + scale_colour_hue(guide = FALSE) )</code></pre>
+<p><img src="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAA2AAAAKgCAIAAAByb4nzAAAACXBIWXMAAA7DAAAOwwHHb6hkAAAgAElEQVR4nOzdeXwV9b0//vdnlrNlIyEbgbAIBBAUBBdEq+JSFaVaLf647VWua9Wv1uqlrW29tbUbLtV7S12r7VWwSq1axKUVlWtbV2QRFBCQAAGSQPbkLLN8Pp/fHxMOmSTnJJA5OSfk9Xz4eJgz55zPvOfNZ2beZ5bPMCklAQAAAAAcpKQ7AAAAAADILCgQAQAAAMAFBSIAAAAAuKBABAAAAAAXFIgAAAAA4IICEQAAAABctHQHQIsWLaqpqUl3FACZKBAIfOc73ykrK+s4cc+ePQ888EC6QgLIcGPHjr311ls7TXz22WdXr16dlngAMt83v/nNk08+udPE9BeIO3bsWLhwYbqjAMg427Ztu/vuu//jP/6jU4HY0tJSU [...]
+<p>Species plot of the “Global Patterns” correspondence analysis first two axes, with each phylum on a different panel (“facet”). Only the most abundant 5 phyla among the most abundant 200 taxa (cumulative, all samples) are included. Arbitrary reduction, for computational efficiency of example.</p>
+<p>Let’s try drawing the figure again, only this time summarizing the species points as a 2D density estimate, without any individual points.</p>
+<pre class="r"><code>(p3 <- ggplot(p1$data, p1$mapping) + geom_density2d() +
+ facet_wrap(~Phylum) + scale_colour_hue(guide = FALSE) )</code></pre>
+<p><img src="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAA2AAAAKgCAIAAAByb4nzAAAACXBIWXMAAA7DAAAOwwHHb6hkAAAgAElEQVR4nOzdd2Ab5f0w8OeGpNMeliV5bztxEsexs/ckEDYUCmWUUkZLSwu84UdbaKEU2jAKlFlKoRAIG8JeWSRkDyexEyfeW5K1193p9vuHgrEl2XEcO3bM8/kruXvuuUePT9JXz0QkSQIQBEEQBEEQ9D10tAsAQRAEQRAEjS0wQIQgCIIgCIL6gAEiBEEQBEEQ1AcMECEIgiAIgqA+YIAIQRAEQRAE9QEDRAiCIAiCIKgPfLQLANasWeN0Oke7FBA0FhEE8bvf/S49Pb33wc7Ozscff3y0igRBY1xBQcHtt98ed3DdunX79u0blfJA0Nj3s5/9bObMmXEHRz9AbG5uXr169WiXAoLGnIaGhvvvv/+GG26ICxBDoZDVa [...]
+<p>These figures reveal some useful patterns and interesting outliers, but what if we want a complete summary of how each phylum is represented along each axis? The following code is a way to show this using boxplots, while still avoiding the occlusion problem (points layered on top of each other), and also conveying some useful information about the pattern of taxa that contribute to the separation of human-associated samples from the other sample types. It re-uses the data that was sto [...]
+<pre class="r"><code>library("reshape2")
+# Melt the species-data.frame, DF, to facet each CA axis separately
+mdf <- melt(p1$data[, c("CA1", "CA2", "Phylum", "Family", "Genus")],
+ id=c("Phylum", "Family", "Genus") )
+# Select some special outliers for labelling
+LF <- subset(mdf, variable=="CA2" & value < -1.0)
+# build plot: boxplot summaries of each CA-axis, with labels
+p <- ggplot(mdf, aes(Phylum, value, color=Phylum)) +
+ geom_boxplot() +
+ facet_wrap(~variable, 2) +
+ scale_colour_hue(guide = FALSE) +
+ theme_bw() +
+ theme( axis.text.x = element_text(angle = -90, vjust = 0.5) )
+# Add the text label layer, and render ggplot graphic
+(p <- p + geom_text(data=subset(LF, !is.na(Family)),
+ mapping = aes(Phylum, value+0.1, color=Phylum, label=Family),
+ vjust=0,
+ size=2))</code></pre>
+<p><img src="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAA2AAAAKgCAIAAAByb4nzAAAACXBIWXMAAA7DAAAOwwHHb6hkAAAgAElEQVR4nOzdfXwU5b3//881M7ub7CbkhoRbuResgKDGilpRsGjx5mC1La1Vod62tOeI9ejXfn/Wb09Pv1SPbcUe2h6/3tRabbVarRUrihZKW8TKAVRQQFRAQKOEJJvNzd7MzPX7YzkxEyAJuLuzu3k9/+AxmZ3d68PsXDvvveZmldZaAAAAgP9h+F0AAAAA8gsBEQAAAB4ERAAAAHgQEAEAAOBBQAQAAIAHAREAAAAelt8FyO23315fX+93FQAAAP3RJZdcMm3atG4z/Q+I77777o033uh3FQAAAP2L67qXXXbZ9OnTDwyIHGIGAACABwERAAAAHgREAAAAeBAQAQAA4EFABAAAgAcBEQAAAB4ERADoieM4v/3tb6+++ [...]
+<p>One way to relate some of the high-level patterns we observed from correspondence analysis is to directly visualize the abundances in an organized, quantitative way, to see if this does in fact support / explain the human/environment microbiome differences. Here is an example using the <code>plot_bar</code> function described in an earlier section.</p>
+<pre class="r"><code>plot_bar(GP, x="human", fill="SampleType", facet_grid= ~ Phylum)</code></pre>
+<p><img src="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAA2AAAAKgCAIAAAByb4nzAAAACXBIWXMAAA7DAAAOwwHHb6hkAAAgAElEQVR4nOzdeVxU1fsH8GeAYRtEUBACQhAVyQ0RS41cERUxkb6uqZWClZYLLpVrgFuKSykuFZmkQpqVeyTueyaCklsmau6igMDsM/f3x/01MjCAwlzuZebzftXrNd6Ze85zD2dmnrnn3nNEDMMQAAAAAMB/LPgOAAAAAACEBQkiAAAAAOhBgggAAAAAepAgAgAAAIAeJIgAAAAAoAcJIgAAAADoseI7AKNZtGjR/fv3+Y7C9Nna2n7yySfOzs6lN169enX16tV8hWRWXn311eHDh5fZ+OWXX+bm5vISj1mxtbWdMGGCh4dH6Y23bt1atmwZXyGZlTZt2owePbrMxnXr1l26dImXeMzNhAkTmjRpwncUUHtMJ0G8fv361 [...]
+<p>In this figure we’ve used the <code>threshold</code> parameter to omit all but phyla accounting for the top 90% of phyla in any one sample. Some patterns emerging from this display appear to be: (1) Cyanobacteria, Actinobacteria appear under-represented in human samples; (2) conversely, Firmicutes appear over-represented in human samples; (3) Acidobacteria, Verrucomicrobia appear over-represented in the fecal samples; (4) the only Crenarchaeota were observed in the Mock sample, which [...]
+</div>
+<div id="double-principle-coordinate-analysis-dpcoa" class="section level3">
+<h3><span class="header-section-number">6.3.4</span> Double Principal Coordinate Analysis (DPCoA)</h3>
+<p>Here is a quick example illustrating the use of Double Principal Coordinate Analysis (DPCoA; Pavoine et al. 2004), using the <code>ordinate()</code> function in phyloseq, as well as the “biplot” option for <code>plot_ordination()</code>. For a description that includes an applied example using the “enterotype” dataset and comparison with UniFrac/PCoA, see Fukuyama et al. (2012).</p>
+<pre class="r"><code>GP.dpcoa <- ordinate(GP, "DPCoA")
+pdpcoa <- plot_ordination(GP, GP.dpcoa, type="biplot",
+ color="SampleType", shape="Phylum")
+shape.fac <- pdpcoa$data[, deparse(pdpcoa$mapping$shape)]
+man.shapes <- c(19, 21:25)
+names(man.shapes) <- c("Samples", levels(shape.fac)[levels(shape.fac)!="Samples"])
+p2dpcoa <- pdpcoa + scale_shape_manual(values=man.shapes)</code></pre>
+<p>A biplot representation of a Double Principal Coordinate Analysis (DPCoA), on a simplified version of the “Global Patterns” dataset with only the most abundant 200 OTUs included.</p>
+<pre class="r"><code># Show just Samples or just Taxa
+plot_ordination(GP, GP.dpcoa, type="taxa", shape="Phylum")</code></pre>
+<p><img src="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAA2AAAAKgCAMAAADK0+6WAAAC/VBMVEUAAAABAQECAgIDAwMEBAQFBQUGBgYHBwcICAgJCQkKCgoLCwsMDAwNDQ0ODg4PDw8QEBARERESEhITExMUFBQVFRUWFhYXFxcYGBgZGRkaGhobGxscHBwdHR0eHh4fHx8gICAhISEiIiIjIyMkJCQlJSUmJiYnJycoKCgpKSkqKiorKyssLCwtLS0uLi4vLy8wMDAxMTEyMjIzMzM0NDQ1NTU2NjY3Nzc4ODg5OTk6Ojo7Ozs8PDw9PT0+Pj4/Pz9AQEBBQUFCQkJDQ0NERERFRUVGRkZHR0dISEhJSUlKSkpLS0tMTExNTU1OTk5PT09QUFBRUVFSUlJTU1NUVFRVVVVWVlZXV1dYWFhZWVlaWlpbW1tcXFxdXV1eXl5fX19gYGBhYWFiYmJjY2NkZ [...]
+<pre class="r"><code>plot_ordination(GP, GP.dpcoa, type="samples", color="SampleType")</code></pre>
+<p><img src="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAA2AAAAKgCAIAAAByb4nzAAAACXBIWXMAAA7DAAAOwwHHb6hkAAAgAElEQVR4nOzdeVyU5f7/8WsYRpTdhFTcIBVzRQXNhdRcIBUzOSounMoyyzRNzc5Jv3bKcqlosTxfl++xjuZuiylmuB8xwRIRJQ2FxC03FBiGZZi55/79cXf4cSMqKMwt4+v5R497rnv7XMBcvb1XnSzLAgAAAPgvJ60LAAAAwP2FgAgAAAAVAiIAAABUCIgAAABQISACAABAhYAIAAAAFWetC6iEzz777I8//tC6igqRJEmv12tdRdWTJEkI4Xhdk2VZlmUnJ0f795Isy8qfok6n07qWKsZXrMZxyF+ZA3/FbDabTqcr3a8JEyY88sgjGpYEO6tJATEjI2PRokVaV1EhRqPR09NT6yqqXk5Ojl6v9/Dw0LqQKiZJUlFRk [...]
+<pre class="r"><code># Split
+plot_ordination(GP, GP.dpcoa, type="split",
+ color="SampleType", shape="Phylum") +
+ ggplot2::scale_colour_discrete()</code></pre>
+<p><img src="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAA2AAAAKgCAIAAAByb4nzAAAACXBIWXMAAA7DAAAOwwHHb6hkAAAgAElEQVR4nOzdeVgT59438DskYd8FQVREUVGxFQE3iooiUMWlUDaBWq2odamIWmyPx7biUq3UDfqorT2K4q5HbfH4RsWqoFgXkAO1omDxAFUUFENIyDbz/jHPyUMgULZMAL+fq1evySz3/ZthBr7ek0w4NE0TAAAAAID/0tN1AQAAAADQsSAgAgAAAIAaBEQAAAAAUIOACAAAAABqEBABAAAAQA0CIgAAAACo4em6gBbYtGnTs2fPdF0FALRAWFiYl5dXvZlxcXF4wBZA57J06dJ+/frpugpgT2cKiI8fP165cqWuqwCA5oqMjGyYDgkh1dXV8fHx7NcDAK1QWloaHx//wQcf6LoQYBVuMQMAAACAGgREAAAAAFCDgAgAA [...]
+</div>
+</div>
+<div id="distance-methods" class="section level2">
+<h2><span class="header-section-number">6.4</span> Distance Methods</h2>
+<div id="distance-central-distance-function" class="section level3">
+<h3><span class="header-section-number">6.4.1</span> distance(): Central Distance Function</h3>
+<p>Many comparisons of microbiome samples, including the graphical model and the PCoA analysis, require a calculation for the relative dissimilarity/distance between one microbial community and another. The phyloseq-package provides a general “wrapper” function for calculating ecological distance matrices between the samples in an experiment.</p>
+<p><code>distance()</code> currently supports 43 method options, as well as user-provided arbitrary methods via an interface to vegan’s <code>designdist()</code> function. Currrently only sample-wise distances are supported (the <code>type</code> argument), but eventually species-wise (OTU-wise) distances will be supported as well. In addition to supporting any of the method options to the three main distance functions of the vegan-package~\cite{veganpkg — including the 14 distances of t [...]
+<p>The function takes a <code>phyloseq-class</code> object and an argument indicating the distance type; and it returns a <code>dist</code>-class distance matrix.</p>
+<pre class="r"><code>data(esophagus)
+distance(esophagus, "bray") </code></pre>
+<pre><code>## B C
+## C 0.4061135
+## D 0.4976303 0.5907173</code></pre>
+<pre class="r"><code>distance(esophagus, "wunifrac") # weighted UniFrac</code></pre>
+<pre><code>## B C
+## C 0.2035424
+## D 0.2603371 0.2477016</code></pre>
+<pre class="r"><code>distance(esophagus, "jaccard") # vegdist jaccard</code></pre>
+<pre><code>## B C
+## C 0.5776398
+## D 0.6645570 0.7427056</code></pre>
+<pre class="r"><code>distance(esophagus, "g") # betadiver method option "g"</code></pre>
+<pre><code>## B C
+## C 0.6136364
+## D 0.6250000 0.6078431</code></pre>
+</div>
+<div id="unifrac-and-weighted-unifrac" class="section level3">
+<h3><span class="header-section-number">6.4.2</span> UniFrac and weighted UniFrac</h3>
+<p>UniFrac is a recently-defined~\cite{Lozupone:2005gn and popular distance metric to summarize the difference between pairs of ecological communities. All UniFrac variants use a phylogenetic tree of the relationship among taxa as central information to calculating the distance between two samples/communities. An unweighted UniFrac distance matrix only considers the presence/absence of taxa, while weighted UniFrac accounts for the relative abundance of taxa as well as their phylogenetic [...]
+<p>The following is an example calculating the UniFrac distance (both weighted and unweighted) matrix using the “esophagus” example dataset:</p>
+<pre class="r"><code>data(esophagus)
+distance(esophagus, "wUniFrac")
+distance(esophagus, "uUniFrac")</code></pre>
+<p>See the phyloseq demo page about fast parallel UniFrac.</p>
+</div>
+</div>
+<div id="hierarchical-clustering" class="section level2">
+<h2><span class="header-section-number">6.5</span> Hierarchical Clustering</h2>
+<p>Another potentially useful and popular way to visualize/decompose sample-distance matrices is through hierarchical clustering (e.g. <code>hclust</code>). In the following example, we reproduce Figure~4 from the <a href="http://www.pnas.org/content/early/2010/06/02/1000080107">“Global Patterns” article</a>, using the unweighted UniFrac distance and the UPGMA method (<code>hclust</code> parameter <code>method="average"</code>). Try <code>help("hclust")</code> for alt [...]
+<pre class="r"><code># (Re)load UniFrac distance matrix and GlobalPatterns data
+data(GlobalPatterns)
+load(system.file("doc", "Unweighted_UniFrac.RData", package="phyloseq"))
+# Manually define color-shading vector based on sample type.
+colorScale <- rainbow(length(levels(get_variable(GlobalPatterns, "SampleType"))))
+cols <- colorScale[get_variable(GlobalPatterns, "SampleType")]
+GP.tip.labels <- as(get_variable(GlobalPatterns, "SampleType"), "character")
+# This is the actual hierarchical clustering call, specifying average-link clustering
+GP.hclust <- hclust(GPUF, method="average")
+plot(GP.hclust, col=cols)</code></pre>
+<p><img src="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAA2AAAAKgCAIAAAByb4nzAAAACXBIWXMAAA7DAAAOwwHHb6hkAAAgAElEQVR4nOzdeVxU5f4H8M+wg+zIPuCCCKKIO6aZmrgQprmF5VK5ZWp1f5Wmt0wrzTa1e1st69bNm1ppZWRXEyxN00RRM8FAXJAdAdmXYc7vj+GOHASB4QznwHzer/4YzvI9X03xw3nO8xyVIAggIiIiIvofM7kbICIiIiJlYUAkIiIiIhEGRCIiIiISYUAkIiIiIhEGRCIiIiISYUAkIiIiIhEGRCIiIiISYUAkIiIiIhEGRCIiIiISYUAkIiIiIhEGRCIiIiISYUAkIiIiIhEGRCIiIiISYUAkIiIiIhEGRCIiIiISYUAkIiIiIhEGRCIiIiISYUAkIiIiIhEGRCIiIiISYUAkIiIiIhEGRCIiIiISYUAkIiIiIhEGR [...]
+<p>An alternative means of summarizing a distance matrix via hierarchical clustering and plotting as an annotated dendrogram. Compare with Figure 4 from the <a href="http://www.pnas.org/content/early/2010/06/02/1000080107">Global Patterns</a>). Some differences in Figure~\ref{fig:GPfig4 from the original article might be explained by <a href="http://www.pnas.org/content/early/2010/06/02/1000080107">Global Patterns</a> in phyloseq being the summed observations from both primer directions [...]
+</div>
+</div>
+<div id="multiple-testing-and-differential-abundance" class="section level1">
+<h1><span class="header-section-number">7</span> Multiple Testing and Differential Abundance</h1>
+<p>One of our recommended approaches to this problem was described in McMurdie and Holmes (2014) <a href="http://dx.plos.org/10.1371/journal.pcbi.1003531">Waste Not, Want Not: Why Rarefying Microbiome Data is Inadmissible</a>. PLoS Computational Biology. 10(4):e1003531</p>
+<p>Some reproducible demonstrations of this approach are included in <a href="http://joey711.github.io/phyloseq-extensions/extensions-index.html">the phyloseq extensions repository</a>, the <code>phyloseq_to_deseq2</code> function, as well as a separate vignette dedicated to this topic (phyloseq and DESeq2 on Colorectal Cancer Data).</p>
+<p>Please make use of these materials for differential abundance testing.</p>
+</div>
+
+
+
+<!-- dynamically load mathjax for compatibility with self-contained -->
+<script>
+ (function () {
+ var script = document.createElement("script");
+ script.type = "text/javascript";
+ script.src = "https://cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML";
+ document.getElementsByTagName("head")[0].appendChild(script);
+ })();
+</script>
+
+</body>
+</html>
diff --git a/inst/doc/phyloseq-basics.R b/inst/doc/phyloseq-basics.R
new file mode 100644
index 0000000..dc905b5
--- /dev/null
+++ b/inst/doc/phyloseq-basics.R
@@ -0,0 +1,112 @@
+## ---- eval=FALSE---------------------------------------------------------
+# vignette("phyloseq_analysis")
+
+## ----load-packages, message=FALSE, warning=FALSE-------------------------
+library("phyloseq")
+
+## ---- eval=FALSE---------------------------------------------------------
+# myOTU1 <- import_RDP_cluster("path/to/my/filename.clust")
+
+## ---- eval=FALSE---------------------------------------------------------
+# data(GlobalPatterns)
+# data(esophagus)
+# data(enterotype)
+# data(soilrep)
+
+## ------------------------------------------------------------------------
+data(GlobalPatterns)
+GlobalPatterns
+
+## ---- eval=FALSE---------------------------------------------------------
+# otu1 <- otu_table(raw_abundance_matrix, taxa_are_rows=FALSE)
+# sam1 <- sample_data(raw_sample_data.frame)
+# tax1 <- tax_table(raw_taxonomy_matrix)
+# tre1 <- read_tree(my_tree_file)
+
+## ---- eval=FALSE---------------------------------------------------------
+# ex1b <- phyloseq(my_otu_table, my_sample_data, my_taxonomyTable, my_tree)
+
+## ---- eval=FALSE---------------------------------------------------------
+# ex1c <- phyloseq(my_otu_table, my_sample_data)
+
+## ----echo=FALSE----------------------------------------------------------
+topN <- 20
+
+## ------------------------------------------------------------------------
+data(GlobalPatterns)
+most_abundant_taxa <- sort(taxa_sums(GlobalPatterns), TRUE)[1:topN]
+ex2 <- prune_taxa(names(most_abundant_taxa), GlobalPatterns)
+
+## ------------------------------------------------------------------------
+topFamilies <- tax_table(ex2)[, "Family"]
+as(topFamilies, "vector")
+
+## ---- eval=FALSE---------------------------------------------------------
+# testOTU <- otu_table(matrix(sample(1:50, 25, replace=TRUE), 5, 5), taxa_are_rows=FALSE)
+# f1<- filterfun_sample(topk(2))
+# wh1 <- genefilter_sample(testOTU, f1, A=2)
+# wh2 <- c(T, T, T, F, F)
+# prune_taxa(wh1, testOTU)
+# prune_taxa(wh2, testOTU)
+
+## ------------------------------------------------------------------------
+data(GlobalPatterns)
+f1<- filterfun_sample(topp(0.1))
+wh1 <- genefilter_sample(GlobalPatterns, f1, A=(1/2*nsamples(GlobalPatterns)))
+sum(wh1)
+ex2 <- prune_taxa(wh1, GlobalPatterns)
+
+## ------------------------------------------------------------------------
+print(ex2)
+
+## ---- eval=FALSE---------------------------------------------------------
+# data(GlobalPatterns)
+# f1<- filterfun_sample(topf(0.9))
+# wh1 <- genefilter_sample(GlobalPatterns, f1, A=(1/3*nsamples(GlobalPatterns)))
+# sum(wh1)
+# prune_taxa(wh1, GlobalPatterns)
+
+## ------------------------------------------------------------------------
+data("enterotype")
+library("genefilter")
+flist<- filterfun(kOverA(5, 2e-05))
+ent.logi <- filter_taxa(enterotype, flist)
+ent.trim <- filter_taxa(enterotype, flist, TRUE)
+identical(ent.trim, prune_taxa(ent.logi, enterotype))
+identical(sum(ent.logi), ntaxa(ent.trim))
+filter_taxa(enterotype, flist, TRUE)
+
+## ------------------------------------------------------------------------
+ex3 <- subset_samples(GlobalPatterns, SampleType%in%c("Freshwater", "Ocean", "Freshwater (creek)"))
+ex3
+
+## ------------------------------------------------------------------------
+subset(sample_data(GlobalPatterns), SampleType%in%c("Freshwater", "Ocean", "Freshwater (creek)"))
+
+## ------------------------------------------------------------------------
+ex4 <- subset_taxa(GlobalPatterns, Phylum=="Firmicutes")
+ex4
+
+## ------------------------------------------------------------------------
+randomSpecies100 <- sample(taxa_names(GlobalPatterns), 100, replace=FALSE)
+ex5 <- prune_taxa(randomSpecies100, GlobalPatterns)
+
+## ---- eval=FALSE---------------------------------------------------------
+# data(GlobalPatterns)
+# ex2 <- transform_sample_counts(GlobalPatterns, I)
+
+## ------------------------------------------------------------------------
+ex4<- transform_sample_counts(GlobalPatterns, threshrankfun(500))
+
+## ---- eval=FALSE---------------------------------------------------------
+# ex6 <- tax_glom(GlobalPatterns, taxlevel="Genus")
+
+## ---- eval=FALSE---------------------------------------------------------
+# ex7 <- tip_glom(GlobalPatterns, speciationMinLength = 0.05)
+
+## ---- eval=FALSE---------------------------------------------------------
+# install.packages("doParallel")
+# install.packages("doMC")
+# install.packages("doSNOW")
+# install.packages("doMPI")
+
diff --git a/inst/doc/phyloseq-basics.Rmd b/inst/doc/phyloseq-basics.Rmd
new file mode 100644
index 0000000..a44421e
--- /dev/null
+++ b/inst/doc/phyloseq-basics.Rmd
@@ -0,0 +1,600 @@
+---
+title: "Basic storage, access, and manipulation of phylogenetic sequencing data with *phyloseq*"
+output:
+ BiocStyle::html_document:
+ fig_height: 7
+ fig_width: 10
+ toc: yes
+ toc_depth: 2
+ number_sections: true
+---
+<!--
+%% \VignetteEngine{knitr::rmarkdown}
+%% \VignetteIndexEntry{phyloseq basics vignette}
+-->
+
+`r library("knitr")`
+`r opts_chunk$set(cache=FALSE, fig.width=9, message=FALSE, warning=FALSE)`
+
+Paul J. McMurdie and Susan Holmes
+
+<mcmurdie at stanford.edu>
+
+[phyloseq Home Page](http://joey711.github.io/phyloseq/)
+
+If you find phyloseq and/or its tutorials useful, please acknowledge and cite phyloseq in your publications:
+
+**phyloseq: An R package for reproducible interactive analysis and graphics of microbiome census data** (2013) PLoS ONE 8(4):e61217
+http://dx.plos.org/10.1371/journal.pone.0061217
+
+## Other resources
+The phyloseq project also has a number of supporting online resources, most of which can be found at [the phyloseq home page](http://joey711.github.com/phyloseq/), or from the phyloseq stable release [page on Bioconductor](http://bioconductor.org/packages/release/bioc/html/phyloseq.html).
+
+To post feature requests or ask for help, try [the phyloseq Issue Tracker](https://github.com/joey711/phyloseq/issues).
+
+
+# Introduction
+
+The analysis of microbiological communities brings many challenges: the integration of many different types of data with methods from ecology, genetics, phylogenetics, network analysis, visualization and testing. The data itself may originate from widely different sources, such as the microbiomes of humans, soils, surface and ocean waters, wastewater treatment plants, industrial facilities, and so on; and as a result, these varied sample types may have very different forms and scales of [...]
+
+
+# About this vignette
+
+## Typesetting Legend <a id="sec:typeset-legend"></a>
+
+- **bold** - Bold is used for emphasis.
+- *italics* - Italics are used for package names, and special words, phrases.
+- `code font` - The font for code, usually courier-like,
+but depends on the theme.
+- `myFun()` - Code font word with `()` attached at the right-end,
+is a function name.
+- [Hyperlink](#sec:typeset-legend) - Hyperlinks are
+clickable text that will jump to sections and external pages.
+
+## Other links and tutorials
+
+An overview of phyloseq's intended functionality, goals, and design is provided
+in the following free and open access article:
+
+McMurdie and Holmes (2013). [phyloseq: An R package for reproducible interactive analysis and graphics of microbiome census data](http://dx.plos.org/10.1371/journal.pone.0061217). PLoS ONE 8(4):e61217.
+
+The most updated examples are posted in our online tutorials from
+[the phyloseq home page](http://joey711.github.com/phyloseq)
+
+A separate vignette describes analysis tools included in phyloseq along with various examples using included example data. A quick way to load it is:
+
+```{r, eval=FALSE}
+vignette("phyloseq_analysis")
+```
+
+By contrast, this vignette is intended to provide functional examples of the basic data import and manipulation infrastructure included in phyloseq. This includes example code for importing OTU-clustered data from different clustering pipelines, as well as performing clear and reproducible filtering tasks that can be altered later and checked for robustness. The motivation for including tools like this in phyloseq is to save time, and also to build-in a structure that requires consistenc [...]
+
+
+# phyloseq classes <a id="sec:app-classes"></a>
+
+The class structure in the *phyloseq* package follows the inheritance diagram shown in the figure below.
+Currently, *phyloseq* uses 4 core data classes.
+They are
+(1) the OTU abundance table (`otu_table`),
+(2) a table of sample data (`sample_data`),
+(3) a table of taxonomic descriptors (`taxonomyTable`), and
+(4) a phylogenetic tree (`"phylo"`-class, [ape package](http://cran.r-project.org/web/packages/ape/)).
+
+The `otu_table` class can be considered the central data type,
+as it directly represents the number and type of sequences observed in each sample.
+`otu_table` extends the numeric matrix class in the `R` base,
+and has a few additional feature slots.
+The most important of these feature slots is the `taxa_are_rows` slot,
+which holds a single logical that indicates whether the table is oriented
+with taxa as rows (as in the *genefilter* package in [Bioconductor](#cite:bioconductor))
+or with taxa as columns (as in *vegan* and *picante* packages).
+In *phyloseq* methods, as well as its extensions of methods in other packages,
+the `taxa_are_rows` value is checked to ensure proper orientation of the `otu_table`.
+A *phyloseq* user is only required to specify the `otu_table` orientation during initialization, following which all handling is internal.
+
+The `sample_data` class directly inherits `R`'s `data.frame` class, and thus effectively stores both categorical and numerical data about each sample. The orientation of a `data.frame` in this context requires that samples/trials are rows, and variables are columns (consistent with *vegan* and other packages). The `taxonomyTable` class directly inherits the `matrix` class, and is oriented such that rows are taxa/OTUs and columns are taxonomic levels (e.g. *Phylum*).
+
+The phyloseq-class can be considered an "experiment-level class" and should contain two or more of the previously-described core data classes. We assume that *phyloseq* users will be interested in analyses that utilize their abundance counts derived from the phylogenetic sequencing data, and so the `phyloseq()` constructor will stop with an error if the arguments do not include an `otu_table`. There are a number of common methods that require either an `otu_table` and `sample_data` combi [...]
+
+![phyloseq class structure](phyloseq_classes_7.png)
+ Classes and inheritance in the *phyloseq* package. The class name and its slots are shown with red- or blue-shaded text, respectively. Coercibility is indicated graphically by arrows with the coercion function shown. Lines without arrows indicate that the more complex class (`phyloseq`) contains a slot with the associated data class as its components.
+
+
+# Load *phyloseq* and import data <a id="sec:load"></a>
+
+Now let's get started by loading phyloseq, and describing some methods for importing data.
+
+## Load *phyloseq*
+
+To use *phyloseq* in a new R session, it will have to be loaded. This can be done in your package manager, or at the command line using the `library()` command:
+```{r load-packages, message=FALSE, warning=FALSE}
+library("phyloseq")
+```
+
+## Import data
+
+An important feature of *phyloseq* is its set of methods
+for importing phylogenetic sequencing data
+from common taxonomic clustering pipelines.
+These methods take file pathnames as input,
+read and parse those files,
+and return a single object that contains all of the data.
+
+Some additional background details are provided below.
+The best reproducible examples on importing data with phyloseq
+can be found on the official data import tutorial page:
+
+http://joey711.github.com/phyloseq/import-data
+
+
+## Import from biom-format <a id="sec:biom"></a>
+
+New versions of QIIME (see below) produce a file in *version 2* of the
+[biom file format](http://biom-format.org/),
+which is a specialized definition of the HDF5 format.
+
+The phyloseq package provides the `import_biom()` function,
+which can import both
+*Version 1* (JSON) and
+*Version 2* (HDF5)
+of the BIOM file format.
+
+The *phyloseq* package fully supports
+both taxa and sample observations of the biom format standard,
+and works with the BIOM files output from QIIME, RDP, MG-RAST, etc.
+
+
+## Import from QIIME (Modern)<a id="sec:qiimeimport"></a>
+
+The default output from modern versions of QIIME
+is a BIOM-format file (among others).
+This is supported in phyloseq.
+
+### Sample data from QIIME
+
+Sometimes inaccurately referred to as *metadata*,
+additional observations on samples provided as a *mapping file* to QIIME
+have not typically been output in the BIOM files,
+**even though BIOM format supports it**.
+This failure to support the full capability of the BIOM format
+means that you'll have to provide sample observations as a separate file.
+There are many ways to do this, but the QIIME sample map is supported.
+
+### Input
+
+Two QIIME output files (`.biom`, `.tre`)
+are recognized by the `import_biom()` function.
+One QIIME input file (sample map, tab-delimited),
+is recognized by the `import_qiime_sample_data()` function.
+
+---
+ Input File(s) | phyloseq function | Output
+ --- | --- | ---
+ `.biom`, `.tre` | `import_biom()` | phyloseq object with OTU table, taxonomy table, and tree (if provided)
+ `.tre` | `read_tree()` | `phylo` object, representing phylogenetic tree.
+ `map.txt` | `import_qiime_sample_data()` | A `sample_data` object
+---
+
+The objects created by each of the import functions above
+should be merged using `merge_phyloseq` to create one coordinated, self-consistent object.
+
+### Output
+
+- **Before Merging** - Before merging with `merge_phyloseq`, the output from these import activities is the three separate objects listed in the previous table.
+- **After Merging** - After merging you have a single self-consistent phyloseq object
+that contains an OTU table, taxonomy table, sample-data, and a phylogenetic tree.
+
+### QIIME Example Tutorial
+
+QIIME's "Moving Pictures" example tutorial output is a little too large
+to include within the phyloseq package
+(and thus is not directly included in this vignette).
+However, the phyloseq home page includes
+a full reproducible example of the import procedure described above:
+
+[phyloseq data import tutorial](http://joey711.github.com/phyloseq/import-data)
+
+For reference, or if you want to try yourself,
+the following are the relative paths within the QIIME tutorial directory
+for each of the files you will need.
+
+- BIOM file, originally at:
+`r "moving_pictures_tutorial-1.9.0/illumina/precomputed-output/otus/otu_table_mc2_w_tax_no_pynast_failures.biom"`
+- Tree file, originally at:
+`r "moving_pictures_tutorial-1.9.0/illumina/precomputed-output/otus/rep_set.tre"`
+- Map File, originally at:
+`r "moving_pictures_tutorial-1.9.0/illumina/map.tsv"`
+
+
+## Import from QIIME Legacy<a id="sec:qiimeimportleg"></a>
+
+[QIIME](#cite:QIIME) is a free, open-source OTU clustering and analysis pipeline written for Unix (mostly Linux). It is distributed in a number of different forms (including a pre-installed virtual machine). See [the QIIME home page](http://qiime.org/) for details.
+
+### Input
+
+One QIIME input file (sample map), and two QIIME output files (`otu_table.txt`, `.tre`) are recognized by the `import_qiime()` function. Only one of the three input files is required to run, although an `"otu_table.txt"` file is required if `import_qiime()` is to return a complete experiment object.
+
+In practice, you will have to find the relevant QIIME files among a number of other files created by the QIIME pipeline. A screenshot of the directory structure created during a typical QIIME run is shown in [the QIIME Directory Figure](#fig:qiimedirectory).
+
+
+<a id="fig:qiimedirectory"></a>
+![QIIME directory structure](import_qiime_directory_structure.jpg)
+ A typical QIIME output directory. The two output files suitable for import by *phyloseq* are highlighted. A third file describing the samples, their barcodes and covariates, is created by the user and required as *input* to QIIME. It is a good idea to import this file, as it can be converted directly to a `sample_data` object and can be extremely useful for certain analyses.
+
+
+### Output
+
+The class of the object returned by `import_qiime()` depends upon which filenames are provided. The most comprehensive class is chosen automatically, based on the input files listed as arguments. At least one argument needs to be provided.
+
+
+
+## Import from mothur <a id="sec:mothurimport"></a>
+
+The open-source, platform-independent, locally-installed software package, [mothur](#cite:Schloss:2009do), can also process barcoded amplicon sequences and perform OTU-clustering. It is extensively documented on [the mothur wiki](http://www.mothur.org/wiki/).
+
+### Input
+
+Currently, there are three different files produced by the *mothur* package (Ver `1.22+`) that can be imported by *phyloseq*. At minimum, a user must supply a "`.list`" file, and at least one of the following two files: `.groups` or `.tree`. The group file is produced by *mothur*'s `make.group()` function. Details can be found at [its wiki page](http://www.mothur.org/wiki/Make.group). The tree file is a phylogenetic tree calculated by *mothur*.
+
+### Output
+
+The output from `import_mothur()` depends on which file types are provided. If all three file types are provided, an instance of the phyloseq-class is returned that contains both an OTU abundance table and its associated phylogenetic tree.
+
+
+## Import from PyroTagger
+
+PyroTagger is an OTU-clustering pipeline for barcoded 16S rRNA amplicon sequences, served and maintained by the Department of Energy's (DOE's) Joint Genome Institute (JGI). It can be used through a straightforward web interface at [the PyroTagger home page](http://pyrotagger.jgi-psf.org/).
+
+PyroTagger takes as input the untrimmed sequence (`.fasta`) and sequence-quality (`.qual`) files, as well as a sample mapping file that contains the bar code sequence for each sample and its name. It uses a 97\% identity threshold for defining OTU clusters (approximately species-level of taxonomic distinction), and provides no options for specifying otherwise. It does allow users to modify the threshold setting for low-quality bases.
+
+### Input
+
+PyroTagger returns a single excel spreadsheet file (`.xls`) containing both abundance and taxonomy data, as well as some associated confidence information related to each taxonomic assignment. This spreadsheet also reports on potential chimeric sequences. This single output file is sufficient for `import_RDP_tab()`, provided the file has been converted to a tab-delimited plain-text format. Any spreadsheet application should suffice. No other changes should be made to the `.xls` file.
+
+### Output
+
+`import_RDP_tab()` returns an instance of the phyloseq-class that contains the OTU abundance table and taxonomy table. To my knowledge, PyroTagger does not calculate a tree of the representative sequences from each OTU cluster, nor a distance object, so analyses like `tip_glom()` and `UniFrac` are not applicable.
+
+
+## Import from RDP pipeline
+
+The Ribosomal Database Project ([RDP](http://rdp.cme.msu.edu/)) provides a web-based barcoded 16S rRNA amplicon sequence processing pipeline called the [RDP Pyrosequencing Pipeline](http://pyro.cme.msu.edu/). A user must run all three of the "Data Processing" steps sequentially through the web interface in order to acquire the output from Complete Linkage Clustering, the approach to OTU clustering used by the RDP Pipeline. Note that this import function assumes that the sequence names in [...]
+
+### Input
+
+The output from the Complete Linkage Clustering, `.clust`, is the only input to the RDP pipeline importer:
+
+```{r, eval=FALSE}
+myOTU1 <- import_RDP_cluster("path/to/my/filename.clust")
+```
+
+### Output
+
+This importer returns an `otu_table` object.
+
+### Expected Naming Convention
+
+The RDP cluster pipeline (specifically, the output of the complete linkage clustering step) has no formal documentation for the ".clust" file structure or its apparent sequence naming convention.
+
+The cluster file itself contains the names of all sequences contained in the input alignment. If the upstream barcode and alignment processing steps are also done with the RDP pipeline, then the sequence names follow a predictable naming convention wherein each sequence is named by its sample and sequence ID, separated by a `"_"` as delimiter:
+
+`sampleName_sequenceIDnumber`
+
+This import function assumes that the sequence names in the cluster file follow this convention, and that the sample name does not contain any `"_"`. It is unlikely to work if this is not the case. It is likely to work if you used the upstream steps in the RDP pipeline to process your raw (barcoded, untrimmed) fasta/fastq data.
+
+
+
+## Example Data (included)
+
+There are multiple example data sets included in *phyloseq*. Many are from published investigations and include documentation with a summary and references, as well as some example code representing some aspect of analysis available in *phyloseq*. In the package index, go to the names beginning with "data-" to see the documentation of currently available example datasets.
+
+To load example data into the working environment, use the `data()` command:
+
+```{r, eval=FALSE}
+data(GlobalPatterns)
+data(esophagus)
+data(enterotype)
+data(soilrep)
+```
+
+Similarly, entering `?enterotype` will reveal the documentation for the so-called "enterotype" dataset. For detailed examples, see [the Example Data tutorial](http://joey711.github.io/phyloseq/Example-Data.html).
+
+## phyloseq Object Summaries
+
+In small font, the following is the summary of the `GlobalPatterns` dataset that prints to the terminal. These summaries are consistent among all `phyloseq-class` objects. Although the components of `GlobalPatterns` have many thousands of elements, the command-line returns only a short summary of each component. This encourages you to check that an object is still what you expect, without needing to let thousands of elements scroll across the terminal. In the cases in which you do want t [...]
+
+
+```{r}
+data(GlobalPatterns)
+GlobalPatterns
+```
+
+
+## Convert raw data to phyloseq components
+
+Suppose you have already imported raw data from an experiment into `R`, and their indices are labeled correctly. How do you get *phyloseq* to recognize these tables as the appropriate class of data? And further combine them together? The [Table of Component Constructor Functions](#table:build) lists key functions for converting these core data formats into specific component data objects recognized by *phyloseq*. A second table lists the constructors that combine those components into a single phyloseq object.
+
+ Table of component constructor functions for building component data objects <a id="table:build"></a>
+
+---
+ Function | Input Class | Output Description
+ --- | --- | ---
+ `otu_table` | numeric matrix | `otu_table` object storing OTU abundance
+ `otu_table` | data.frame | `otu_table` object storing OTU abundance
+ `sample_data` | data.frame | `sample_data` object storing sample variables
+ `tax_table` | character matrix | `taxonomyTable` object storing taxonomic identities
+ `tax_table` | data.frame | `taxonomyTable` object storing taxonomic identities
+ `read_tree` | file path char | phylo-class tree, read from file
+ `read.table` | table file path | A matrix or data.frame (Std `R` core function)
+---
+
+ phyloseq constructors: functions for building/merging *phyloseq* objects.
+
+---
+Function | Input Class | Output Description
+--- | --- | ---
+`phyloseq` | Two or more component objects | phyloseq-class, *experiment-level* object
+`merge_phyloseq`| Two or more component or phyloseq-class objects | Combined instance of phyloseq-class
+---
+
+The following example illustrates using the constructor methods for component data tables.
+
+```{r, eval=FALSE}
+otu1 <- otu_table(raw_abundance_matrix, taxa_are_rows=FALSE)
+sam1 <- sample_data(raw_sample_data.frame)
+tax1 <- tax_table(raw_taxonomy_matrix)
+tre1 <- read_tree(my_tree_file)
+```
+
+## phyloseq() function: building complex phyloseq objects
+
+Once you've converted the data tables to their appropriate class, combining them into one object requires only one additional function call, `phyloseq()`:
+```{r, eval=FALSE}
+ex1b <- phyloseq(my_otu_table, my_sample_data, my_taxonomyTable, my_tree)
+```
+
+You do not need to have all four data types in the example above in order to combine them into one validity-checked experiment-level phyloseq-class object. The `phyloseq()` method will detect which component data classes are present, and build accordingly. Downstream analysis methods will access the required components using *phyloseq*'s accessors, and throw an error if something is missing. For most downstream methods you will only need to supply the combined, phyloseq-class object (the [...]
+```{r, eval=FALSE}
+ex1c <- phyloseq(my_otu_table, my_sample_data)
+```
+
+Whenever an instance of the phyloseq-class is created by *phyloseq* --- for example, when we use the `import_qiime()` function to import data, or combine manually imported tables using `phyloseq()` --- the row and column indices representing taxa or samples are internally checked/trimmed for compatibility, such that all component data describe exactly (and only) the same OTUs and samples.
+
+## Merge
+
+The phyloseq project includes support for two completely different categories of merging.
+
+ - Merging the OTUs or samples in a phyloseq object, based upon a taxonomic or sample variable: `merge_samples()`, `merge_taxa()`
+ - Merging two or more data objects that come from the same experiment, so that their data becomes part of the same phyloseq object: `merge_phyloseq()`
+
+For further details, see the reproducible online tutorial at:
+
+http://joey711.github.com/phyloseq/merge
+
+
+
+# Accessor functions <a id="sec:accessors"></a>
+
+Once you have a phyloseq object available, many accessor functions are available to query aspects of the data set. The function name and its purpose are summarized in [the Accessor Functions Table](#table:access).
+
+ Accessor functions for *phyloseq* objects.
+
+<a id="table:access"></a>
+
+---
+Function | Returns
+--- | ---
+ `[` | Standard extraction operator. Works on `otu_table`, `sample_data`, and `taxonomyTable`
+ `access` | General slot accessor function for phyloseq-package
+ `get_taxa` | Abundance values of all taxa in sample `i`
+ `get_sample` | Abundance values of taxon `i` for all samples
+ `get_taxa_unique` | A unique vector of the observed taxa at a particular taxonomic rank
+ `get_variable` | An individual sample variable vector/factor
+ `nsamples` | Get the number of samples described by an object
+ `ntaxa` | Get the number of OTUs (taxa) described by an object
+ `otu_table` | Build or access otu_table objects
+ `rank_names` | Get the names of the available taxonomic ranks
+ `sample_data` | Build or access `sample_data` objects
+ `sample_names` | The names of all samples
+ `taxa_names` | The names of all taxa
+ `sample_sums` | The sum of the abundance values of each sample
+ `sample_variables` | The names of sample variables
+ `taxa_sums` | The sum of the abundance values of each taxon
+ `taxa_are_rows` | `TRUE` if taxa are row indices in `otu_table`
+ `tax_table` | A taxonomy table
+ `phy_tree` | Access the tree contained in a phyloseq object
+---
+
+
+
+# Trimming, subsetting, filtering phyloseq data <a id="sec:trim"></a>
+
+## Trimming: prune_taxa()
+Trimming high-throughput phylogenetic sequencing data can be useful, or even necessary, for certain types of analyses. However, it is important that the original data always be available for reference and reproducibility; and that the methods used for trimming be transparent to others, so they can perform the same trimming or filtering steps on the same or related data. To facilitate this, *phyloseq* contains many ways to trim/filter the data from a phylogenetic sequencing project. Becau [...]
+
+In general, most trimming should be accomplished using the S4 methods `prune_taxa()` or `prune_samples()`.
+
+## Simple filtering example
+
+```{r echo=FALSE}
+topN <- 20
+```
+
+For example, let's make a new object that only holds the most abundant `r topN` taxa in the experiment. To accomplish this, we will use the `prune_taxa()` function.
+
+```{r}
+data(GlobalPatterns)
+most_abundant_taxa <- sort(taxa_sums(GlobalPatterns), TRUE)[1:topN]
+ex2 <- prune_taxa(names(most_abundant_taxa), GlobalPatterns)
+```
+
+Now we can ask the question, "what taxonomic Family are these OTUs?" (Subsetting still returns a `taxonomyTable` object, which is summarized. We will need to convert to a vector)
+
+```{r}
+topFamilies <- tax_table(ex2)[, "Family"]
+as(topFamilies, "vector")
+```
+
+## Arbitrarily complex abundance filtering
+
+The previous example was a relatively simple filtering in which we kept only the most abundant `r topN` taxa in the whole experiment. But what if we wanted to keep the most abundant `r topN` taxa of each sample? And of those, keep only the taxa that are also found in at least one-third of our samples? What if we wanted to keep only those taxa that met some across-sample criteria?
+
+### genefilter_sample(): Filter by Within-Sample Criteria
+
+For this more complicated filtering *phyloseq* contains a function, `genefilter_sample`, that takes as an argument a *phyloseq* object, as well as a list of one or more filtering functions that will be applied to each sample in the abundance matrix (`otu_table`), as well as an integer argument, `A`, that specifies for how many samples the filtering function must return `TRUE` for a particular taxa to avoid removal from the object. A supporting function `filterfun_sample` is also included [...]
+
+Here is an example on a completely fabricated `otu_table` called `testOTU`.
+```{r, eval=FALSE}
+testOTU <- otu_table(matrix(sample(1:50, 25, replace=TRUE), 5, 5), taxa_are_rows=FALSE)
+f1<- filterfun_sample(topk(2))
+wh1 <- genefilter_sample(testOTU, f1, A=2)
+wh2 <- c(T, T, T, F, F)
+prune_taxa(wh1, testOTU)
+prune_taxa(wh2, testOTU)
+```
+
+Here is a second example using the included dataset, `GlobalPatterns`. The most abundant taxa are kept only if they are in the most abundant 10\% of taxa in at least half of the samples in dataset `GlobalPatterns`. Note that it is not necessary to subset `GlobalPatterns` in order to do this filtering. The S4 method `prune_taxa` subsets each of the relevant component objects, and returns the complex object back.
+
+```{r}
+data(GlobalPatterns)
+f1<- filterfun_sample(topp(0.1))
+wh1 <- genefilter_sample(GlobalPatterns, f1, A=(1/2*nsamples(GlobalPatterns)))
+sum(wh1)
+ex2 <- prune_taxa(wh1, GlobalPatterns)
+```
+
+```{r}
+print(ex2)
+```
+
+If instead of the most abundant fraction of taxa, you are interested in the most abundant fraction of individuals (aka sequences, observations), then the `topf` function is appropriate. For steep rank-abundance curves, `topf` will seem to be much more conservative (trim more taxa) because it is based on the cumulative sum of relative abundance. It does not guarantee that a certain number or fraction of total taxa (richness) will be retained.
+
+```{r, eval=FALSE}
+data(GlobalPatterns)
+f1<- filterfun_sample(topf(0.9))
+wh1 <- genefilter_sample(GlobalPatterns, f1, A=(1/3*nsamples(GlobalPatterns)))
+sum(wh1)
+prune_taxa(wh1, GlobalPatterns)
+```
+
+### filter_taxa(): Filter by Across-Sample Criteria
+
+The `filter_taxa` function is directly analogous to the `genefilter` function for microarray filtering, but is used for filtering OTUs from phyloseq objects. It applies an arbitrary set of functions -- as a function list, for instance, created by `genefilter::filterfun` -- as across-sample criteria, one OTU at a time. It can be thought of as an extension of the genefilter-package (from the Bioconductor repository) for phyloseq objects. It takes as input a phyloseq object, and returns a l [...]
+
+Inspect the following example. Note that the functions `genefilter` and `kOverA` are from the genefilter package.
+
+```{r}
+data("enterotype")
+library("genefilter")
+flist<- filterfun(kOverA(5, 2e-05))
+ent.logi <- filter_taxa(enterotype, flist)
+ent.trim <- filter_taxa(enterotype, flist, TRUE)
+identical(ent.trim, prune_taxa(ent.logi, enterotype))
+identical(sum(ent.logi), ntaxa(ent.trim))
+filter_taxa(enterotype, flist, TRUE)
+```
+
+## subset_samples(): Subset by Sample Variables
+
+It is possible to subset the samples in a *phyloseq* object based on the sample variables using the `subset_samples()` function. For example to subset `GlobalPatterns` such that only certain environments are retained, the following line is needed (the related tables are subsetted automatically as well):
+
+```{r}
+ex3 <- subset_samples(GlobalPatterns, SampleType%in%c("Freshwater", "Ocean", "Freshwater (creek)"))
+ex3
+```
+
+For this example only a categorical variable is shown, but in principle a continuous variable could be specified and a logical expression provided just as for the `subset` function. In fact, because `sample_data` component objects are an extension of the data.frame class, they can also be subsetted with the `subset` function:
+
+```{r}
+subset(sample_data(GlobalPatterns), SampleType%in%c("Freshwater", "Ocean", "Freshwater (creek)"))
+```
+
+## subset_taxa(): subset by taxonomic categories
+
+It is possible to subset by specific taxonomic category using the `subset_taxa()` function. For example, if we wanted to subset `GlobalPatterns` so that it only contains data regarding the phylum *Firmicutes*:
+
+```{r}
+ex4 <- subset_taxa(GlobalPatterns, Phylum=="Firmicutes")
+ex4
+```
+
+## random subsample abundance data
+
+You can also randomly subset the data; for example, the following takes a random subset of 100 taxa from the full dataset.
+
+```{r}
+randomSpecies100 <- sample(taxa_names(GlobalPatterns), 100, replace=FALSE)
+ex5 <- prune_taxa(randomSpecies100, GlobalPatterns)
+```
+
+
+# Transform abundance data<a id="sec:transform"></a>
+
+Sample-wise transformation can be achieved with the `transform_sample_counts()` function. It requires two arguments, (1) the *phyloseq* object that you want to transform, and the function that you want to use to perform the transformation. Any arbitrary function can be provided as the second argument, as long as it returns a numeric vector with the same length as its input. In the following trivial example, we create a second object, `ex2`, that has been "transformed" by the identity fun [...]
+
+```{r, eval=FALSE}
+data(GlobalPatterns)
+ex2 <- transform_sample_counts(GlobalPatterns, I)
+```
+
+For certain kinds of analysis we may want to transform the abundance data. For example, for RDA we want to transform abundance counts to within-sample ranks, and to further include a threshold beyond which all taxa receive the same rank value. The ranking for each sample is performed independently, so that the rank of a particular taxon within a particular sample is not influenced by that sample's total quantity of sequencing relative to the other samples in the project.
+
+The following example shows how to perform such a thresholded-rank transformation of the abundance table in the complex *phyloseq* object `GlobalPatterns` with an arbitrary threshold of 500.
+
+```{r}
+ex4<- transform_sample_counts(GlobalPatterns, threshrankfun(500))
+```
+
+
+# Phylogenetic smoothing <a id="sec:glom"></a>
+
+## tax_glom()
+
+Suppose we are skeptical about the importance of OTU-level distinctions in our dataset. For this scenario, *phyloseq* includes a taxonomic-agglomeration method, `tax_glom()`, which merges taxa of the same taxonomic category for a user-specified taxonomic level. In the following code, we merge all taxa of the same Genus, and store that new object as `ex6`.
+
+```{r, eval=FALSE}
+ex6 <- tax_glom(GlobalPatterns, taxlevel="Genus")
+```
+
+## tip_glom()
+
+Similarly, our original example object (`GlobalPatterns`) also contains a phylogenetic tree corresponding to each OTU, which we could also use as a means to merge taxa in our dataset that are closely related. In this case, we specify a threshold patristic distance. Taxa more closely related than this threshold are merged. This is especially useful when a dataset has many taxa that lack a taxonomic assignment at the level you want to investigate, a problem when using `tax_glom()`. Note th [...]
+
+```{r, eval=FALSE}
+ex7 <- tip_glom(GlobalPatterns, speciationMinLength = 0.05)
+```
+
+Command output is not provided here, to save time during compilation of the vignette. You are encouraged to try this out on your own dataset, or even on this example, if interested. It may take a while to run on the full, untrimmed data.
+
+
+# Installation
+
+## Installation
+
+Please check [the phyloseq installation tutorial](http://joey711.github.com/phyloseq/install) for help with installation. This is likely to be the first place news and updated information about installation will be posted, as well. Also check out the rest of [the phyloseq homepage on GitHub](http://joey711.github.io/phyloseq/), as this is the best place to post issues, bug reports, feature requests, contribute code, etc.
+
+## Installing Parallel Backend
+
+For running parallel implementations of functions/methods in *phyloseq* (e.g. `UniFrac(GlobalPatterns, parallel=TRUE)`), you will also need to install a package for registering a parallel "backend". Only one working parallel backend is needed, but there are several options, and the best one will depend on the details of your particular system. The "doParallel" package is a good place to start. Any one of the following lines from an `R` session will install a backend package.
+
+```{r, eval=FALSE}
+install.packages("doParallel")
+install.packages("doMC")
+install.packages("doSNOW")
+install.packages("doMPI")
+```
+
+
+# References
+
+<a id="cite:bioconductor"></a>
+Robert C Gentleman, Vincent J. Carey, Douglas M. Bates, et al. **Bioconductor: Open software development for computational biology and bioinformatics.** *Genome Biology* 5:R80, 2004.
+
+<a id="cite:QIIME"></a>
+J Gregory Caporaso, Justin Kuczynski, Jesse Stombaugh, Kyle Bittinger, Frederic D Bushman, et al. **QIIME allows analysis of high-throughput community sequencing data.** *Nature Methods* 7(5):335-336, 2010.
+
+<a id="cite:Schloss:2009do"></a>
+P D Schloss, S L Westcott, T Ryabin, J R Hall, M Hartmann, et al. **Introducing mothur: Open-Source, Platform-Independent, Community-Supported Software for Describing and Comparing Microbial Communities.** *Applied and Environmental Microbiology* 75(23):7537-7541, 2009.
+
+<a id="cite:RDP"></a>
+J R Cole, Q Wang, E Cardenas, J Fish, B Chai et al. **The Ribosomal Database Project: improved alignments and new tools for rRNA analysis.** *Nucleic Acids Research* 37(Database issue):D141-5, 2009.
diff --git a/inst/doc/phyloseq-basics.html b/inst/doc/phyloseq-basics.html
new file mode 100644
index 0000000..122d664
--- /dev/null
+++ b/inst/doc/phyloseq-basics.html
@@ -0,0 +1,509 @@
+<!DOCTYPE html>
+
+<html xmlns="http://www.w3.org/1999/xhtml">
+
+<head>
+
+<meta charset="utf-8">
+<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
+<meta name="generator" content="pandoc" />
+
+
+
+<title>Basic storage, access, and manipulation of phylogenetic sequencing data with phyloseq</title>
+
+<link href="data:text/css;charset=utf-8,pre%20%2Eoperator%2C%0Apre%20%2Eparen%20%7B%0Acolor%3A%20rgb%28104%2C%20118%2C%20135%29%0A%7D%0Apre%20%2Eliteral%20%7B%0Acolor%3A%20%23990073%0A%7D%0Apre%20%2Enumber%20%7B%0Acolor%3A%20%23099%3B%0A%7D%0Apre%20%2Ecomment%20%7B%0Acolor%3A%20%23998%3B%0Afont%2Dstyle%3A%20italic%0A%7D%0Apre%20%2Ekeyword%20%7B%0Acolor%3A%20%23900%3B%0Afont%2Dweight%3A%20bold%0A%7D%0Apre%20%2Eidentifier%20%7B%0Acolor%3A%20rgb%280%2C%200%2C%200%29%3B%0A%7D%0Apre%20%2Estri [...]
+<script src="data:application/x-javascript;base64,dmFyIGhsanM9bmV3IGZ1bmN0aW9uKCl7ZnVuY3Rpb24gbShwKXtyZXR1cm4gcC5yZXBsYWNlKC8mL2dtLCImYW1wOyIpLnJlcGxhY2UoLzwvZ20sIiZsdDsiKX1mdW5jdGlvbiBmKHIscSxwKXtyZXR1cm4gUmVnRXhwKHEsIm0iKyhyLmNJPyJpIjoiIikrKHA/ImciOiIiKSl9ZnVuY3Rpb24gYihyKXtmb3IodmFyIHA9MDtwPHIuY2hpbGROb2Rlcy5sZW5ndGg7cCsrKXt2YXIgcT1yLmNoaWxkTm9kZXNbcF07aWYocS5ub2RlTmFtZT09IkNPREUiKXtyZXR1cm4gcX1pZighKHEubm9kZVR5cGU9PTMmJnEubm9kZVZhbHVlLm1hdGNoKC9ccysvKSkpe2JyZWFrfX19ZnVuY3Rpb24gaCh0LH [...]
+
+<style type="text/css">code{white-space: pre;}</style>
+<style type="text/css">
+ pre:not([class]) {
+ background-color: white;
+ }
+</style>
+<script type="text/javascript">
+if (window.hljs && document.readyState && document.readyState === "complete") {
+ window.setTimeout(function() {
+ hljs.initHighlighting();
+ }, 0);
+}
+</script>
+
+
+<link href="data:text/css;charset=utf-8,body%2C%20td%20%7B%0Afont%2Dfamily%3A%20sans%2Dserif%3B%0Abackground%2Dcolor%3A%20white%3B%0Afont%2Dsize%3A%2013px%3B%0A%7D%0Abody%20%7B%0Amax%2Dwidth%3A%20800px%3B%0Amargin%3A%200%20auto%3B%0Apadding%3A%201em%201em%202em%3B%0Aline%2Dheight%3A%2020px%3B%0A%7D%0A%0Adiv%23TOC%20li%20%7B%0Alist%2Dstyle%3Anone%3B%0Abackground%2Dimage%3Anone%3B%0Abackground%2Drepeat%3Anone%3B%0Abackground%2Dposition%3A0%3B%0A%7D%0A%0Ap%2C%20pre%20%7B%20margin%3A%200em%2 [...]
+
+<script type="text/javascript">
+document.addEventListener("DOMContentLoaded", function() {
+ var links = document.links;
+ for (var i = 0, linksLength = links.length; i < linksLength; i++)
+ if(links[i].hostname != window.location.hostname)
+ links[i].target = '_blank';
+});
+</script>
+
+</head>
+
+<body>
+
+
+<div id="header">
+<h1 class="title">Basic storage, access, and manipulation of phylogenetic sequencing data with <em>phyloseq</em></h1>
+</div>
+
+<h1>Contents</h1>
+<div id="TOC">
+<ul>
+<li><a href="#other-resources"><span class="toc-section-number">0.1</span> Other resources</a></li>
+<li><a href="#introduction"><span class="toc-section-number">1</span> Introduction</a></li>
+<li><a href="#about-this-vignette"><span class="toc-section-number">2</span> About this vignette</a><ul>
+<li><a href="#typesetting-legend"><span class="toc-section-number">2.1</span> Typesetting Legend <a id="sec:typeset-legend"></a></a></li>
+<li><a href="#other-links-and-tutorials"><span class="toc-section-number">2.2</span> Other links and tutorials</a></li>
+</ul></li>
+<li><a href="#phyloseq-classes"><span class="toc-section-number">3</span> phyloseq classes <a id="sec:app-classes"></a></a></li>
+<li><a href="#load-phyloseq-and-import-data"><span class="toc-section-number">4</span> Load <em>phyloseq</em> and import data <a id="sec:load"></a></a><ul>
+<li><a href="#load-phyloseq"><span class="toc-section-number">4.1</span> Load <em>phyloseq</em></a></li>
+<li><a href="#import-data"><span class="toc-section-number">4.2</span> Import data</a></li>
+<li><a href="#import-from-biom-format"><span class="toc-section-number">4.3</span> Import from biom-format <a id="sec:biom"></a></a></li>
+<li><a href="#import-from-qiime-modern"><span class="toc-section-number">4.4</span> Import from QIIME (Modern)<a id="sec:qiimeimport"></a></a></li>
+<li><a href="#import-from-qiime-legacy"><span class="toc-section-number">4.5</span> Import from QIIME Legacy<a id="sec:qiimeimportleg"></a></a></li>
+<li><a href="#import-from-mothur"><span class="toc-section-number">4.6</span> Import from mothur <a id="sec:mothurimport"></a></a></li>
+<li><a href="#import-from-pyrotagger"><span class="toc-section-number">4.7</span> Import from PyroTagger</a></li>
+<li><a href="#import-from-rdp-pipeline"><span class="toc-section-number">4.8</span> Import from RDP pipeline</a></li>
+<li><a href="#example-data-included"><span class="toc-section-number">4.9</span> Example Data (included)</a></li>
+<li><a href="#phyloseq-object-summaries"><span class="toc-section-number">4.10</span> phyloseq Object Summaries</a></li>
+<li><a href="#convert-raw-data-to-phyloseq-components"><span class="toc-section-number">4.11</span> Convert raw data to phyloseq components</a></li>
+<li><a href="#phyloseq-function-building-complex-phyloseq-objects"><span class="toc-section-number">4.12</span> phyloseq() function: building complex phyloseq objects</a></li>
+<li><a href="#merge"><span class="toc-section-number">4.13</span> Merge</a></li>
+</ul></li>
+<li><a href="#accessor-functions"><span class="toc-section-number">5</span> Accessor functions <a id="sec:accessors"></a></a></li>
+<li><a href="#trimming-subsetting-filtering-phyloseq-data"><span class="toc-section-number">6</span> Trimming, subsetting, filtering phyloseq data <a id="sec:trim"></a></a><ul>
+<li><a href="#trimming-prune_taxa"><span class="toc-section-number">6.1</span> Trimming: prune_taxa()</a></li>
+<li><a href="#simple-filtering-example"><span class="toc-section-number">6.2</span> Simple filtering example</a></li>
+<li><a href="#arbitrarily-complex-abundance-filtering"><span class="toc-section-number">6.3</span> Arbitrarily complex abundance filtering</a></li>
+<li><a href="#subset_samples-subset-by-sample-variables"><span class="toc-section-number">6.4</span> subset_samples(): Subset by Sample Variables</a></li>
+<li><a href="#subset_taxa-subset-by-taxonomic-categories"><span class="toc-section-number">6.5</span> subset_taxa(): subset by taxonomic categories</a></li>
+<li><a href="#random-subsample-abundance-data"><span class="toc-section-number">6.6</span> random subsample abundance data</a></li>
+</ul></li>
+<li><a href="#transform-abundance-data"><span class="toc-section-number">7</span> Transform abundance data<a id="sec:transform"></a></a></li>
+<li><a href="#phylogenetic-smoothing"><span class="toc-section-number">8</span> Phylogenetic smoothing <a id="sec:glom"></a></a><ul>
+<li><a href="#tax_glom"><span class="toc-section-number">8.1</span> tax_glom()</a></li>
+<li><a href="#tip_glom"><span class="toc-section-number">8.2</span> tip_glom()</a></li>
+</ul></li>
+<li><a href="#installation"><span class="toc-section-number">9</span> Installation</a><ul>
+<li><a href="#installation-1"><span class="toc-section-number">9.1</span> Installation</a></li>
+<li><a href="#installing-parallel-backend"><span class="toc-section-number">9.2</span> Installing Parallel Backend</a></li>
+</ul></li>
+<li><a href="#references"><span class="toc-section-number">10</span> References</a></li>
+</ul>
+</div>
+
+<!--
+%% \VignetteEngine{knitr::rmarkdown}
+%% \VignetteIndexEntry{phyloseq basics vignette}
+-->
+<p>Paul J. McMurdie and Susan Holmes</p>
+<p><a href="mailto:mcmurdie at stanford.edu">mcmurdie at stanford.edu</a></p>
+<p><a href="http://joey711.github.io/phyloseq/">phyloseq Home Page</a></p>
+<p>If you find phyloseq and/or its tutorials useful, please acknowledge and cite phyloseq in your publications:</p>
+<p><strong>phyloseq: An R package for reproducible interactive analysis and graphics of microbiome census data</strong> (2013) PLoS ONE 8(4):e61217 <a href="http://dx.plos.org/10.1371/journal.pone.0061217" class="uri">http://dx.plos.org/10.1371/journal.pone.0061217</a></p>
+<div id="other-resources" class="section level2">
+<h2><span class="header-section-number">0.1</span> Other resources</h2>
+<p>The phyloseq project also has a number of supporting online resources, most of which can be found at <a href="http://joey711.github.com/phyloseq/">the phyloseq home page</a>, or from the phyloseq stable release <a href="http://bioconductor.org/packages/release/bioc/html/phyloseq.html">page on Bioconductor</a>.</p>
+<p>To post feature requests or ask for help, try <a href="https://github.com/joey711/phyloseq/issues">the phyloseq Issue Tracker</a>.</p>
+</div>
+<div id="introduction" class="section level1">
+<h1><span class="header-section-number">1</span> Introduction</h1>
+<p>The analysis of microbiological communities brings many challenges: the integration of many different types of data with methods from ecology, genetics, phylogenetics, network analysis, visualization and testing. The data itself may originate from widely different sources, such as the microbiomes of humans, soils, surface and ocean waters, wastewater treatment plants, industrial facilities, and so on; and as a result, these varied sample types may have very different forms and scales [...]
+</div>
+<div id="about-this-vignette" class="section level1">
+<h1><span class="header-section-number">2</span> About this vignette</h1>
+<div id="typesetting-legend" class="section level2">
+<h2><span class="header-section-number">2.1</span> Typesetting Legend <a id="sec:typeset-legend"></a></h2>
+<ul>
+<li><strong>bold</strong> - Bold is used for emphasis.</li>
+<li><em>italics</em> - Italics are used for package names, and special words, phrases.</li>
+<li><code>code font</code> - The font for code, usually courier-like, but depends on the theme.</li>
+<li><code>myFun()</code> - Code font word with <code>()</code> attached at the right-end, is a function name.</li>
+<li><a href="#sec:typeset-legend">Hyperlink</a> - Hyperlinks are clickable text that will jump to sections and external pages.</li>
+</ul>
+</div>
+<div id="other-links-and-tutorials" class="section level2">
+<h2><span class="header-section-number">2.2</span> Other links and tutorials</h2>
+<p>An overview of phyloseq’s intended functionality, goals, and design is provided in the following free and open access article:</p>
+<p>McMurdie and Holmes (2013). <a href="http://dx.plos.org/10.1371/journal.pone.0061217">phyloseq: An R package for reproducible interactive analysis and graphics of microbiome census data</a>. PLoS ONE e61217.</p>
+<p>The most updated examples are posted in our online tutorials from <a href="http://joey711.github.com/phyloseq">the phyloseq home page</a></p>
+<p>A separate vignette describes analysis tools included in phyloseq along with various examples using included example data. A quick way to load it is:</p>
+<pre class="r"><code>vignette("phyloseq_analysis")</code></pre>
+<p>By contrast, this vignette is intended to provide functional examples of the basic data import and manipulation infrastructure included in phyloseq. This includes example code for importing OTU-clustered data from different clustering pipelines, as well as performing clear and reproducible filtering tasks that can be altered later and checked for robustness. The motivation for including tools like this in phyloseq is to save time, and also to build-in a structure that requires consist [...]
+</div>
+</div>
+<div id="phyloseq-classes" class="section level1">
+<h1><span class="header-section-number">3</span> phyloseq classes <a id="sec:app-classes"></a></h1>
+<p>The class structure in the <em>phyloseq</em> package follows the inheritance diagram shown in the figure below. Currently, <em>phyloseq</em> uses 4 core data classes. They are (1) the OTU abundance table (<code>otu_table</code>); (2) a table of sample data (<code>sample_data</code>); (3) a table of taxonomic descriptors (<code>taxonomyTable</code>); and (4) a phylogenetic tree (<code>"phylo"</code>-class, <a href="http://cran.r-project.org/web/packages/ape/">ape package</a>).</p>
+<p>The <code>otu_table</code> class can be considered the central data type, as it directly represents the number and type of sequences observed in each sample. <code>otu_table</code> extends the numeric matrix class in the <code>R</code> base, and has a few additonal feature slots. The most important of these feature slots is the <code>taxa_are_rows</code> slot, which holds a single logical that indicates whether the table is oriented with taxa as rows (as in the <em>genefilter</em> pac [...]
+<p>The <code>sample_data</code> class directly inherits <code>R</code>’s <code>data.frame</code> class, and thus effectively stores both categorical and numerical data about each sample. The orientation of a <code>data.frame</code> in this context requires that samples/trials are rows, and variables are columns (consistent with <em>vegan</em> and other packages). The <code>taxonomyTable</code> class directly inherits the <code>matrix</code> class, and is oriented such that rows are taxa/ [...]
+<p>The phyloseq-class can be considered an “experiment-level class” and should contain two or more of the previously-described core data classes. We assume that <em>phyloseq</em> users will be interested in analyses that utilize their abundance counts derived from the phylogenetic sequencing data, and so the <code>phyloseq()</code> constructor will stop with an error if the arguments do not include an <code>otu_table</code>. There are a number of common methods that require either an <co [...]
+<p><img src="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAA4QAAAG3CAYAAADsP5sRAAABL2lDQ1BJQ0MgUHJvZmlsZQAAGBljYGAycHRxcmUSYGDIzSspCnJ3UoiIjFJgv8DAwcDNIMxgzGCdmFxc4BgQ4MMABHn5eakgGhV8u8bACBK5rAsyC1WOII8ruaCoBKjqDxAbpaQWJzMwMBoA2dnlJQVAccY5QLZIUjaYvQHELgoJcgaKHwGy+dIh7CsgdhKE/QTELgJ6AqjmC0h9OpjNxAFiJ0HYMiB2SWoFyF4G5/yCyqLM9IwSBSMDAwMFx5T8pFSF4MriktTcYgXPvOT8ooL8osSS1BSgWoj7QLoYBCEKQSGmYWhpaaEJFqUiAYoHiHGfA8Hhyyh2BiGGsCi5tKgMymNkMmZgIMRHmDFHgoHBfykDA8sfhJhJLwPDAh0GBv6pCDE1QwYGAX0Ghn1zAMOvU [...]
+</div>
+<div id="load-phyloseq-and-import-data" class="section level1">
+<h1><span class="header-section-number">4</span> Load <em>phyloseq</em> and import data <a id="sec:load"></a></h1>
+<p>Now let’s get started by loading phyloseq, and describing some methods for importing data.</p>
+<div id="load-phyloseq" class="section level2">
+<h2><span class="header-section-number">4.1</span> Load <em>phyloseq</em></h2>
+<p>To use <em>phyloseq</em> in a new R session, it will have to be loaded. This can be done in your package manager, or at the command line using the <code>library()</code> command:</p>
+<pre class="r"><code>library("phyloseq")</code></pre>
+</div>
+<div id="import-data" class="section level2">
+<h2><span class="header-section-number">4.2</span> Import data</h2>
+<p>An important feature of <em>phyloseq</em> is its set of methods for importing phylogenetic sequencing data from common taxonomic clustering pipelines. These methods take file pathnames as input, read and parse those files, and return a single object that contains all of the data.</p>
+<p>Some additional background details are provided below. The best reproducible examples on importing data with phyloseq can be found on the official data import tutorial page:</p>
+<p><a href="http://joey711.github.com/phyloseq/import-data" class="uri">http://joey711.github.com/phyloseq/import-data</a></p>
+</div>
+<div id="import-from-biom-format" class="section level2">
+<h2><span class="header-section-number">4.3</span> Import from biom-format <a id="sec:biom"></a></h2>
+<p>New versions of QIIME (see below) produce a file in <em>version 2</em> of the <a href="http://biom-format.org/">biom file format</a>, which is a specialized definition of the HDF5 format.</p>
+<p>The phyloseq package provides the <code>import_biom()</code> function, which can import both <em>Version 1</em> (JSON) and <em>Version 2</em> (HDF5) of the BIOM file format.</p>
+<p>The <em>phyloseq</em> package fully supports both taxa and sample observations of the biom format standard, and works with the BIOM files output from QIIME, RDP, MG-RAST, etc.</p>
+</div>
+<div id="import-from-qiime-modern" class="section level2">
+<h2><span class="header-section-number">4.4</span> Import from QIIME (Modern)<a id="sec:qiimeimport"></a></h2>
+<p>The default output from modern versions of QIIME is a BIOM-format file (among others). This is supported in phyloseq.</p>
+<div id="sample-data-from-qiime" class="section level3">
+<h3><span class="header-section-number">4.4.1</span> Sample data from QIIME</h3>
+<p>Sometimes inaccurately referred to as <em>metadata</em>, additional observations on samples provided as <em>mapping file</em> to QIIME have not typically been output in the BIOM files, <strong>even though BIOM format supports it</strong>. This failure to support the full capability of the BIOM format means that you’ll have to provide sample observations as a separate file. There are many ways to do this, but the QIIME sample map is supported.</p>
+</div>
+<div id="input" class="section level3">
+<h3><span class="header-section-number">4.4.2</span> Input</h3>
+<p>Two QIIME output files (<code>.biom</code>, <code>.tre</code>) are recognized by the <code>import_biom()</code> function. One QIIME input file (sample map, tab-delimited), is recognized by the <code>import_qiime_sample_data()</code> function.</p>
+<p>The objects created by each of the import functions above should be merged using <code>merge_phyloseq</code> to create one coordinated, self-consistent object.</p>
+</div>
+<div id="output" class="section level3">
+<h3><span class="header-section-number">4.4.3</span> Output</h3>
+<ul>
+<li><strong>Before Merging</strong> - Before merging with <code>merge_phyloseq</code>, the output from these import activities is the three separate objects listed in the previous table.</li>
+<li><strong>After Merging</strong> - After merging you have a single self-consistent phyloseq object that contains an OTU table, taxonomy table, sample-data, and a phylogenetic tree.</li>
+</ul>
+</div>
+<div id="qiime-example-tutorial" class="section level3">
+<h3><span class="header-section-number">4.4.4</span> QIIME Example Tutorial</h3>
+<p>QIIME’s “Moving Pictures” example tutorial output is a little too large to include within the phyloseq package (and thus is not directly included in this vignette). However, the phyloseq home page includes a full reproducible example of the import procedure described above:</p>
+<p><strong>Link HERE</strong></p>
+<p>For reference, or if you want to try it yourself, the following are the relative paths within the QIIME tutorial directory for each of the files you will need.</p>
+<ul>
+<li>BIOM file, originally at: moving_pictures_tutorial-1.9.0/illumina/precomputed-output/otus/otu_table_mc2_w_tax_no_pynast_failures.biom</li>
+<li>Tree file, originally at: moving_pictures_tutorial-1.9.0/illumina/precomputed-output/otus/rep_set.tre</li>
+<li>Map File, originally at: moving_pictures_tutorial-1.9.0/illumina/map.tsv</li>
+</ul>
+</div>
+</div>
+<div id="import-from-qiime-legacy" class="section level2">
+<h2><span class="header-section-number">4.5</span> Import from QIIME Legacy<a id="sec:qiimeimportleg"></a></h2>
+<p><a href="#cite:QIIME">QIIME</a> is a free, open-source OTU clustering and analysis pipeline written for Unix (mostly Linux). It is distributed in a number of different forms (including a pre-installed virtual machine). See <a href="http://qiime.org/">the QIIME home page</a> for details.</p>
+<div id="input-1" class="section level3">
+<h3><span class="header-section-number">4.5.1</span> Input</h3>
+<p>One QIIME input file (sample map), and two QIIME output files (<code>otu_table.txt</code>, <code>.tre</code>) are recognized by the <code>import_qiime()</code> function. Only one of the three input files is required to run, although an <code>"otu_table.txt"</code> file is required if <code>import_qiime()</code> is to return a complete experiment object.</p>
+<p>In practice, you will have to find the relevant QIIME files among a number of other files created by the QIIME pipeline. A screenshot of the directory structure created during a typical QIIME run is shown in <a href="#fig:qiimedirectory">the QIIME Directory Figure</a>.</p>
+<p><a id="fig:qiimedirectory"></a> <img src="data:image/jpeg;base64,/9j/4AAQSkZJRgABAQEASwBLAAD/4gJASUNDX1BST0ZJTEUAAQEAAAIwQURCRQIQAABtbnRyUkdCIFhZWiAH0AAIAAsAEwAzADthY3NwQVBQTAAAAABub25lAAAAAAAAAAAAAAAAAAAAAAAA9tYAAQAAAADTLUFEQkUAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAApjcHJ0AAAA/AAAADJkZXNjAAABMAAAAGt3dHB0AAABnAAAABRia3B0AAABsAAAABRyVFJDAAABxAAAAA5nVFJDAAAB1AAAAA5iVFJDAAAB5AAAAA5yWFlaAAAB9AAAABRnWFlaAAACCAAAABRiWFlaAAACHAAAABR0ZXh0AAAAAENvcHlyaWdodCAyMDAwIEFkb2Jl [...]
+</div>
+<div id="output-1" class="section level3">
+<h3><span class="header-section-number">4.5.2</span> Output</h3>
+<p>The class of the object returned by <code>import_qiime()</code> depends upon which filenames are provided. The most comprehensive class is chosen automatically, based on the input files listed as arguments. At least one argument needs to be provided.</p>
+</div>
+</div>
+<div id="import-from-mothur" class="section level2">
+<h2><span class="header-section-number">4.6</span> Import from mothur <a id="sec:mothurimport"></a></h2>
+<p>The open-source, platform-independent, locally-installed software package, <a href="#cite:Schloss:2009do">mothur</a>, can also process barcoded amplicon sequences and perform OTU-clustering. It is extensively documented on <a href="http://www.mothur.org/wiki/">the mothur wiki</a></p>
+<div id="input-2" class="section level3">
+<h3><span class="header-section-number">4.6.1</span> Input</h3>
+<p>Currently, there are three different files produced by the <em>mothur</em> package (Ver <code>1.22+</code>) that can be imported by <em>phyloseq</em>. At minimum, a user must supply a “<code>.list</code>” file, and at least one of the following two files: <code>.groups</code> or <code>.tree</code>. The group file is produced by <em>mothur</em>’s <code>make.group()</code> function. Details can be found at <a href="http://www.mothur.org/wiki/Make.group">its wiki page</a>. The tree file [...]
+</div>
+<div id="output-2" class="section level3">
+<h3><span class="header-section-number">4.6.2</span> Output</h3>
+<p>The output from <code>import_mothur()</code> depends on which file types are provided. If all three file types are provided, an instance of the phyloseq-class is returned that contains both an OTU abundance table and its associated phylogenetic tree.</p>
+</div>
+</div>
+<div id="import-from-pyrotagger" class="section level2">
+<h2><span class="header-section-number">4.7</span> Import from PyroTagger</h2>
+<p>PyroTagger is an OTU-clustering pipeline for barcoded 16S rRNA amplicon sequences, served and maintained by the Department of Energy’s (DOE’s) Joint Genome Institute (JGI). It can be used through a straightforward web interface at <a href="http://pyrotagger.jgi-psf.org/">the PyroTagger home page</a></p>
+<p>PyroTagger takes as input the untrimmed sequence (<code>.fasta</code>) and sequence-quality (<code>.qual</code>) files, as well as a sample mapping file that contains the bar code sequence for each sample and its name. It uses a 97% identity threshold for defining OTU clusters (approximately species-level of taxonomic distinction), and provides no options for specifying otherwise. It does allow users to modify the threshold setting for low-quality bases.</p>
+<div id="input-3" class="section level3">
+<h3><span class="header-section-number">4.7.1</span> Input</h3>
+<p>PyroTagger returns a single excel spreadsheet file (<code>.xls</code>) containing both abundance and taxonomy data, as well as some associated confidence information related to each taxonomic assignment. This spreadsheet also reports on potential chimeric sequences. This single output file is sufficient for <code>import_RDP_tab()</code>, provided the file has been converted to a tab-delimited plain-text format. Any spreadsheet application should suffice. No other changes should be mad [...]
+</div>
+<div id="output-3" class="section level3">
+<h3><span class="header-section-number">4.7.2</span> Output</h3>
+<p><code>import_RDP_tab()</code> returns an instance of the phyloseq-class that contains the OTU abundance table and taxonomy table. To my knowledge, PyroTagger does not calculate a tree of the representative sequences from each OTU cluster, nor a distance object, so analyses like <code>tip_glom()</code> and <code>UniFrac</code> are not applicable.</p>
+</div>
+</div>
+<div id="import-from-rdp-pipeline" class="section level2">
+<h2><span class="header-section-number">4.8</span> Import from RDP pipeline</h2>
+<p>The Ribosomal Database Project (<a href="http://rdp.cme.msu.edu/">RDP</a>) provides a web-based barcoded 16S rRNA amplicon sequence processing pipeline called the <a href="http://pyro.cme.msu.edu/">RDP Pyrosequencing Pipeline</a>. A user must run all three of the “Data Processing” steps sequentially through the web interface in order to acquire the output from Complete Linkage Clustering, the approach to OTU clustering used by the RDP Pipeline. Note that this import function assumes t [...]
+<div id="input-4" class="section level3">
+<h3><span class="header-section-number">4.8.1</span> Input</h3>
+<p>The output from the Complete Linkage Clustering, <code>.clust</code>, is the only input to the RDP pipeline importer:</p>
+<pre class="r"><code>myOTU1 <- import_RDP_cluster("path/to/my/filename.clust")</code></pre>
+</div>
+<div id="output-4" class="section level3">
+<h3><span class="header-section-number">4.8.2</span> Output</h3>
+<p>This importer returns an <code>otu_table</code> object.</p>
+</div>
+<div id="expected-naming-convention" class="section level3">
+<h3><span class="header-section-number">4.8.3</span> Expected Naming Convention</h3>
+<p>The RDP cluster pipeline (specifically, the output of the complete linkage clustering step) has no formal documentation for the “.clust” file structure or its apparent sequence naming convention.</p>
+<p>The cluster file itself contains the names of all sequences contained in the input alignment. If the upstream barcode and alignment processing steps are also done with the RDP pipeline, then the sequence names follow a predictable naming convention wherein each sequence is named by its sample and sequence ID, separated by a <code>"_"</code> as delimiter:</p>
+<p><code>sampleName_sequenceIDnumber</code></p>
+<p>This import function assumes that the sequence names in the cluster file follow this convention, and that the sample name does not contain any <code>"_"</code>. It is unlikely to work if this is not the case. It is likely to work if you used the upstream steps in the RDP pipeline to process your raw (barcoded, untrimmed) fasta/fastq data.</p>
+</div>
+</div>
+<div id="example-data-included" class="section level2">
+<h2><span class="header-section-number">4.9</span> Example Data (included)</h2>
+<p>There are multiple example data sets included in <em>phyloseq</em>. Many are from published investigations and include documentation with a summary and references, as well as some example code representing some aspect of analysis available in <em>phyloseq</em>. In the package index, go to the names beginning with “data-” to see the documentation of currently available example datasets.</p>
+<p>To load example data into the working environment, use the <code>data()</code> command:</p>
+<pre class="r"><code>data(GlobalPatterns)
+data(esophagus)
+data(enterotype)
+data(soilrep) </code></pre>
+<p>Similarly, entering <code>?enterotype</code> will reveal the documentation for the so-called “enterotype” dataset. For detailed examples, see <a href="http://joey711.github.io/phyloseq/Example-Data.html">the Example Data tutorial</a>.</p>
+</div>
+<div id="phyloseq-object-summaries" class="section level2">
+<h2><span class="header-section-number">4.10</span> phyloseq Object Summaries</h2>
+<p>In small font, the following is the summary of the <code>GlobalPatterns</code> dataset that prints to the terminal. These summaries are consistent among all <code>phyloseq-class</code> objects. Although the components of <code>GlobalPatterns</code> have many thousands of elements, the command-line returns only a short summary of each component. This encourages you to check that an object is still what you expect, without needing to let thousands of elements scroll across the terminal. [...]
+<pre class="r"><code>data(GlobalPatterns)
+GlobalPatterns</code></pre>
+<pre><code>## phyloseq-class experiment-level object
+## otu_table() OTU Table: [ 19216 taxa and 26 samples ]
+## sample_data() Sample Data: [ 26 samples by 7 sample variables ]
+## tax_table() Taxonomy Table: [ 19216 taxa by 7 taxonomic ranks ]
+## phy_tree() Phylogenetic Tree: [ 19216 tips and 19215 internal nodes ]</code></pre>
+</div>
+<div id="convert-raw-data-to-phyloseq-components" class="section level2">
+<h2><span class="header-section-number">4.11</span> Convert raw data to phyloseq components</h2>
+<p>Suppose you have already imported raw data from an experiment into <code>R</code>, and their indices are labeled correctly. How do you get <em>phyloseq</em> to recognize these tables as the appropriate class of data? And further combine them together? Table <a href="#table:build">Table of Component Constructor Functions</a> lists key functions for converting these core data formats into specific component data objects recognized by <em>phyloseq</em>. These will also</p>
+<p>Table of component constructor functions for building component data objects <a id="table:build"></a></p>
+<p>phyloseq constructors: functions for building/merging <em>phyloseq</em> objects.</p>
+<p>The following example illustrates using the constructor methods for component data tables.</p>
+<pre class="r"><code>otu1 <- otu_table(raw_abundance_matrix, taxa_are_rows=FALSE)
+sam1 <- sample_data(raw_sample_data.frame)
+tax1 <- tax_table(raw_taxonomy_matrix)
+tre1 <- read_tree(my_tree_file)</code></pre>
+</div>
+<div id="phyloseq-function-building-complex-phyloseq-objects" class="section level2">
+<h2><span class="header-section-number">4.12</span> phyloseq() function: building complex phyloseq objects</h2>
+<p>Once you’ve converted the data tables to their appropriate class, combining them into one object requires only one additional function call, <code>phyloseq()</code>:</p>
+<pre class="r"><code>ex1b <- phyloseq(my_otu_table, my_sample_data, my_taxonomyTable, my_tree)</code></pre>
+<p>You do not need to have all four data types in the example above in order to combine them into one validity-checked experiment-level phyloseq-class object. The <code>phyloseq()</code> method will detect which component data classes are present, and build accordingly. Downstream analysis methods will access the required components using <em>phyloseq</em>’s accessors, and throw an error if something is missing. For most downstream methods you will only need to supply the combined, phylo [...]
+<pre class="r"><code>ex1c <- phyloseq(my_otu_table, my_sample_data)</code></pre>
+<p>Whenever an instance of the phyloseq-class is created by <em>phyloseq</em> — for example, when we use the <code>import_qiime()</code> function to import data, or combine manually imported tables using <code>phyloseq()</code> — the row and column indices representing taxa or samples are internally checked/trimmed for compatibility, such that all component data describe exactly (and only) the same OTUs and samples.</p>
+</div>
+<div id="merge" class="section level2">
+<h2><span class="header-section-number">4.13</span> Merge</h2>
+<p>The phyloseq project includes support for two completely different categories of merging.</p>
+<ul>
+<li>Merging the OTUs or samples in a phyloseq object, based upon a taxonomic or sample variable: <code>merge_samples()</code>, <code>merge_taxa()</code></li>
+<li>Merging two or more data objects that come from the same experiment, so that their data becomes part of the same phyloseq object: <code>merge_phyloseq()</code></li>
+</ul>
+<p>For further details, see the reproducible online tutorial at:</p>
+<p><a href="http://joey711.github.com/phyloseq/merge" class="uri">http://joey711.github.com/phyloseq/merge</a></p>
+</div>
+</div>
+<div id="accessor-functions" class="section level1">
+<h1><span class="header-section-number">5</span> Accessor functions <a id="sec:accessors"></a></h1>
+<p>Once you have a phyloseq object available, many accessor functions are available to query aspects of the data set. The function name and its purpose are summarized in <a href="#table:access">the Accessor Functions Table</a>.</p>
+<p>Accessor functions for <em>phyloseq</em> objects.</p>
+<p><a id="table:access"></a></p>
+</div>
+<div id="trimming-subsetting-filtering-phyloseq-data" class="section level1">
+<h1><span class="header-section-number">6</span> Trimming, subsetting, filtering phyloseq data <a id="sec:trim"></a></h1>
+<div id="trimming-prune_taxa" class="section level2">
+<h2><span class="header-section-number">6.1</span> Trimming: prune_taxa()</h2>
+<p>Trimming high-throughput phylogenetic sequencing data can be useful, or even necessary, for certain types of analyses. However, it is important that the original data always be available for reference and reproducibility; and that the methods used for trimming be transparent to others, so they can perform the same trimming or filtering steps on the same or related data. To facilitate this, <em>phyloseq</em> contains many ways to trim/filter the data from a phylogenetic sequencing proj [...]
+<p>In general, most trimming should be accomplished using the S4 methods <code>prune_taxa()</code> or <code>prune_samples()</code>.</p>
+</div>
+<div id="simple-filtering-example" class="section level2">
+<h2><span class="header-section-number">6.2</span> Simple filtering example</h2>
+<p>For example, let’s make a new object that only holds the most abundant 20 taxa in the experiment. To accomplish this, we will use the <code>prune_taxa()</code> function.</p>
+<pre class="r"><code>data(GlobalPatterns)
+most_abundant_taxa <- sort(taxa_sums(GlobalPatterns), TRUE)[1:topN]
+ex2 <- prune_taxa(names(most_abundant_taxa), GlobalPatterns)</code></pre>
+<p>Now we can ask the question, “what taxonomic Family are these OTUs?” (Subsetting still returns a <code>taxonomyTable</code> object, which is summarized. We will need to convert to a vector)</p>
+<pre class="r"><code>topFamilies <- tax_table(ex2)[, "Family"]
+as(topFamilies, "vector")</code></pre>
+<pre><code>## [1] NA "ACK-M1" "ACK-M1"
+## [4] "Bifidobacteriaceae" NA NA
+## [7] "Nostocaceae" NA "Neisseriaceae"
+## [10] "Neisseriaceae" "Pasteurellaceae" "Enterobacteriaceae"
+## [13] "Bacteroidaceae" "Bacteroidaceae" "Bacteroidaceae"
+## [16] "Clostridiaceae" "Ruminococcaceae" "Ruminococcaceae"
+## [19] "Ruminococcaceae" "Streptococcaceae"</code></pre>
+</div>
+<div id="arbitrarily-complex-abundance-filtering" class="section level2">
+<h2><span class="header-section-number">6.3</span> Arbitrarily complex abundance filtering</h2>
+<p>The previous example was a relatively simple filtering in which we kept only the most abundant 20 taxa in the whole experiment. But what if we wanted to keep the most abundant 20 taxa of each sample? And of those, keep only the taxa that are also found in at least one-third of our samples? What if we wanted to keep only those taxa that met some across-sample criteria?</p>
+<div id="genefilter_sample-filter-by-within-sample-criteria" class="section level3">
+<h3><span class="header-section-number">6.3.1</span> genefilter_sample(): Filter by Within-Sample Criteria</h3>
+<p>For this more complicated filtering <em>phyloseq</em> contains a function, <code>genefilter_sample</code>, that takes as an argument a <em>phyloseq</em> object, as well as a list of one or more filtering functions that will be applied to each sample in the abundance matrix (<code>otu_table</code>), as well as an integer argument, <code>A</code>, that specifies for how many samples the filtering function must return <code>TRUE</code> for a particular taxa to avoid removal from the obje [...]
+<p>Here is an example on a completely fabricated <code>otu_table</code> called <code>testOTU</code>.</p>
+<pre class="r"><code>testOTU <- otu_table(matrix(sample(1:50, 25, replace=TRUE), 5, 5), taxa_are_rows=FALSE)
+f1<- filterfun_sample(topk(2))
+wh1 <- genefilter_sample(testOTU, f1, A=2)
+wh2 <- c(T, T, T, F, F)
+prune_taxa(wh1, testOTU)
+prune_taxa(wh2, testOTU)</code></pre>
+<p>Here is a second example using the included dataset, <code>GlobalPatterns</code>. The most abundant taxa are kept only if they are in the most abundant 10% of taxa in at least half of the samples in dataset <code>GlobalPatterns</code>. Note that it is not necessary to subset <code>GlobalPatterns</code> in order to do this filtering. The S4 method <code>prune_taxa</code> subsets each of the relevant component objects, and returns the complex object back.</p>
+<pre class="r"><code>data(GlobalPatterns)
+f1<- filterfun_sample(topp(0.1))
+wh1 <- genefilter_sample(GlobalPatterns, f1, A=(1/2*nsamples(GlobalPatterns)))
+sum(wh1)</code></pre>
+<pre><code>## [1] 795</code></pre>
+<pre class="r"><code>ex2 <- prune_taxa(wh1, GlobalPatterns)</code></pre>
+<pre class="r"><code>print(ex2)</code></pre>
+<pre><code>## phyloseq-class experiment-level object
+## otu_table() OTU Table: [ 795 taxa and 26 samples ]
+## sample_data() Sample Data: [ 26 samples by 7 sample variables ]
+## tax_table() Taxonomy Table: [ 795 taxa by 7 taxonomic ranks ]
+## phy_tree() Phylogenetic Tree: [ 795 tips and 794 internal nodes ]</code></pre>
+<p>If instead of the most abundant fraction of taxa, you are interested in the most abundant fraction of individuals (aka sequences, observations), then the <code>topf</code> function is appropriate. For steep rank-abundance curves, <code>topf</code> will seem to be much more conservative (trim more taxa) because it is based on the cumulative sum of relative abundance. It does not guarantee that a certain number or fraction of total taxa (richness) will be retained.</p>
+<pre class="r"><code>data(GlobalPatterns)
+f1<- filterfun_sample(topf(0.9))
+wh1 <- genefilter_sample(GlobalPatterns, f1, A=(1/3*nsamples(GlobalPatterns)))
+sum(wh1)
+prune_taxa(wh1, GlobalPatterns)</code></pre>
+</div>
+<div id="filter_taxa-filter-by-across-sample-criteria" class="section level3">
+<h3><span class="header-section-number">6.3.2</span> filter_taxa(): Filter by Across-Sample Criteria</h3>
+<p>The <code>filter_taxa</code> function is directly analogous to the <code>genefilter</code> function for microarray filtering, but is used for filtering OTUs from phyloseq objects. It applies an arbitrary set of functions – as a function list, for instance, created by <code>genefilter::filterfun</code> – as across-sample criteria, one OTU at a time. It can be thought of as an extension of the genefilter-package (from the Bioconductor repository) for phyloseq objects. It takes as input [...]
+<p>Inspect the following example. Note that the functions <code>genefilter</code> and <code>kOverA</code> are from the genefilter package.</p>
+<pre class="r"><code>data("enterotype")
+library("genefilter")
+flist<- filterfun(kOverA(5, 2e-05))
+ent.logi <- filter_taxa(enterotype, flist)
+ent.trim <- filter_taxa(enterotype, flist, TRUE)
+identical(ent.trim, prune_taxa(ent.logi, enterotype)) </code></pre>
+<pre><code>## [1] TRUE</code></pre>
+<pre class="r"><code>identical(sum(ent.logi), ntaxa(ent.trim))</code></pre>
+<pre><code>## [1] TRUE</code></pre>
+<pre class="r"><code>filter_taxa(enterotype, flist, TRUE)</code></pre>
+<pre><code>## phyloseq-class experiment-level object
+## otu_table() OTU Table: [ 416 taxa and 280 samples ]
+## sample_data() Sample Data: [ 280 samples by 9 sample variables ]
+## tax_table() Taxonomy Table: [ 416 taxa by 1 taxonomic ranks ]</code></pre>
+</div>
+</div>
+<div id="subset_samples-subset-by-sample-variables" class="section level2">
+<h2><span class="header-section-number">6.4</span> subset_samples(): Subset by Sample Variables</h2>
+<p>It is possible to subset the samples in a <em>phyloseq</em> object based on the sample variables using the <code>subset_samples()</code> function. For example to subset <code>GlobalPatterns</code> such that only certain environments are retained, the following line is needed (the related tables are subsetted automatically as well):</p>
+<pre class="r"><code>ex3 <- subset_samples(GlobalPatterns, SampleType%in%c("Freshwater", "Ocean", "Freshwater (creek)"))
+ex3</code></pre>
+<pre><code>## phyloseq-class experiment-level object
+## otu_table() OTU Table: [ 19216 taxa and 8 samples ]
+## sample_data() Sample Data: [ 8 samples by 7 sample variables ]
+## tax_table() Taxonomy Table: [ 19216 taxa by 7 taxonomic ranks ]
+## phy_tree() Phylogenetic Tree: [ 19216 tips and 19215 internal nodes ]</code></pre>
+<p>For this example only a categorical variable is shown, but in principle a continuous variable could be specified and a logical expression provided just as for the <code>subset</code> function. In fact, because <code>sample_data</code> component objects are an extension of the data.frame class, they can also be subsetted with the <code>subset</code> function:</p>
+<pre class="r"><code>subset(sample_data(GlobalPatterns), SampleType%in%c("Freshwater", "Ocean", "Freshwater (creek)"))</code></pre>
+<pre><code>## X.SampleID Primer Final_Barcode Barcode_truncated_plus_T
+## LMEpi24M LMEpi24M ILBC_13 ACACTG CAGTGT
+## SLEpi20M SLEpi20M ILBC_15 ACAGAG CTCTGT
+## AQC1cm AQC1cm ILBC_16 ACAGCA TGCTGT
+## AQC4cm AQC4cm ILBC_17 ACAGCT AGCTGT
+## AQC7cm AQC7cm ILBC_18 ACAGTG CACTGT
+## NP2 NP2 ILBC_19 ACAGTT AACTGT
+## NP3 NP3 ILBC_20 ACATCA TGATGT
+## NP5 NP5 ILBC_21 ACATGA TCATGT
+## Barcode_full_length SampleType
+## LMEpi24M CATGAACAGTG Freshwater
+## SLEpi20M AGCCGACTCTG Freshwater
+## AQC1cm GACCACTGCTG Freshwater (creek)
+## AQC4cm CAAGCTAGCTG Freshwater (creek)
+## AQC7cm ATGAAGCACTG Freshwater (creek)
+## NP2 TCGCGCAACTG Ocean
+## NP3 GCTAAGTGATG Ocean
+## NP5 GAACGATCATG Ocean
+## Description
+## LMEpi24M Lake Mendota Minnesota, 24 meter epilimnion
+## SLEpi20M Sparkling Lake Wisconsin, 20 meter eplimnion
+## AQC1cm Allequash Creek, 0-1cm depth
+## AQC4cm Allequash Creek, 3-4 cm depth
+## AQC7cm Allequash Creek, 6-7 cm depth
+## NP2 Newport Pier, CA surface water, Time 1
+## NP3 Newport Pier, CA surface water, Time 2
+## NP5 Newport Pier, CA surface water, Time 3</code></pre>
+</div>
+<div id="subset_taxa-subset-by-taxonomic-categories" class="section level2">
+<h2><span class="header-section-number">6.5</span> subset_taxa(): subset by taxonomic categories</h2>
+<p>It is possible to subset by specific taxonomic category using the <code>subset_taxa()</code> function. For example, if we wanted to subset <code>GlobalPatterns</code> so that it only contains data regarding the phylum <em>Firmicutes</em>:</p>
+<pre class="r"><code>ex4 <- subset_taxa(GlobalPatterns, Phylum=="Firmicutes")
+ex4</code></pre>
+<pre><code>## phyloseq-class experiment-level object
+## otu_table() OTU Table: [ 4356 taxa and 26 samples ]
+## sample_data() Sample Data: [ 26 samples by 7 sample variables ]
+## tax_table() Taxonomy Table: [ 4356 taxa by 7 taxonomic ranks ]
+## phy_tree() Phylogenetic Tree: [ 4356 tips and 4355 internal nodes ]</code></pre>
+</div>
+<div id="random-subsample-abundance-data" class="section level2">
+<h2><span class="header-section-number">6.6</span> random subsample abundance data</h2>
+<p>You can also randomly subset the data, for example by taking a random subset of 100 taxa from the full dataset.</p>
+<pre class="r"><code>randomSpecies100 <- sample(taxa_names(GlobalPatterns), 100, replace=FALSE)
+ex5 <- prune_taxa(randomSpecies100, GlobalPatterns)</code></pre>
+</div>
+</div>
+<div id="transform-abundance-data" class="section level1">
+<h1><span class="header-section-number">7</span> Transform abundance data<a id="sec:transform"></a></h1>
+<p>Sample-wise transformation can be achieved with the <code>transform_sample_counts()</code> function. It requires two arguments: (1) the <em>phyloseq</em> object that you want to transform, and (2) the function that you want to use to perform the transformation. Any arbitrary function can be provided as the second argument, as long as it returns a numeric vector with the same length as its input. In the following trivial example, we create a second object, <code>ex2</code>, that has been “ [...]
+<pre class="r"><code>data(GlobalPatterns)
+ex2 <- transform_sample_counts(GlobalPatterns, I)</code></pre>
+<p>For certain kinds of analysis we may want to transform the abundance data. For example, for RDA we want to transform abundance counts to within-sample ranks, and to further include a threshold beyond which all taxa receive the same rank value. The ranking for each sample is performed independently, so that the rank of a particular taxon within a particular sample is not influenced by that sample’s total quantity of sequencing relative to the other samples in the project.</p>
+<p>The following example shows how to perform such a thresholded-rank transformation of the abundance table in the complex <em>phyloseq</em> object <code>GlobalPatterns</code> with an arbitrary threshold of 500.</p>
+<pre class="r"><code>ex4<- transform_sample_counts(GlobalPatterns, threshrankfun(500))</code></pre>
+</div>
+<div id="phylogenetic-smoothing" class="section level1">
+<h1><span class="header-section-number">8</span> Phylogenetic smoothing <a id="sec:glom"></a></h1>
+<div id="tax_glom" class="section level2">
+<h2><span class="header-section-number">8.1</span> tax_glom()</h2>
+<p>Suppose we are skeptical about the importance of OTU-level distinctions in our dataset. For this scenario, <em>phyloseq</em> includes a taxonomic-agglomeration method, <code>tax_glom()</code>, which merges taxa of the same taxonomic category for a user-specified taxonomic level. In the following code, we merge all taxa of the same Genus, and store that new object as <code>ex6</code>.</p>
+<pre class="r"><code>ex6 <- tax_glom(GlobalPatterns, taxlevel="Genus")</code></pre>
+</div>
+<div id="tip_glom" class="section level2">
+<h2><span class="header-section-number">8.2</span> tip_glom()</h2>
+<p>Similarly, our original example object (<code>GlobalPatterns</code>) also contains a phylogenetic tree corresponding to each OTU, which we could also use as a means to merge taxa in our dataset that are closely related. In this case, we specify a threshold patristic distance. Taxa more closely related than this threshold are merged. This is especially useful when a dataset has many taxa that lack a taxonomic assignment at the level you want to investigate, a problem when using <code>t [...]
+<pre class="r"><code>ex7 <- tip_glom(GlobalPatterns, speciationMinLength = 0.05)</code></pre>
+<p>Command output is not provided here, to save time during compilation of the vignette. You are encouraged to try this out on your own dataset, or even on this example, if interested. It may take a while to run on the full, untrimmed data.</p>
+</div>
+</div>
+<div id="installation" class="section level1">
+<h1><span class="header-section-number">9</span> Installation</h1>
+<div id="installation-1" class="section level2">
+<h2><span class="header-section-number">9.1</span> Installation</h2>
+<p>Please check <a href="http://joey711.github.com/phyloseq/install">the phyloseq installation tutorial</a> for help with installation. This is likely to be the first place news and updated information about installation will be posted, as well. Also check out the rest of <a href="http://joey711.github.io/phyloseq/">the phyloseq homepage on GitHub</a>, as this is the best place to post issues, bug reports, feature requests, contribute code, etc.</p>
+</div>
+<div id="installing-parallel-backend" class="section level2">
+<h2><span class="header-section-number">9.2</span> Installing Parallel Backend</h2>
+<p>For running parallel implementation of functions/methods in <em>phyloseq</em> (e.g. <code>UniFrac(GlobalPatterns, parallel=TRUE)</code>), you will need also to install a function for registering a parallel “backend”. Only one working parallel backend is needed, but there are several options, and the best one will depend on the details of your particular system. The “doParallel” package is a good place to start. Any one of the following lines from an <code>R</code> session will install [...]
+<pre class="r"><code>install.packages("doParallel")
+install.packages("doMC")
+install.packages("doSNOW")
+install.packages("doMPI")</code></pre>
+</div>
+</div>
+<div id="references" class="section level1">
+<h1><span class="header-section-number">10</span> References</h1>
+<p><a id="cite:bioconductor"></a> Robert C Gentleman, Vincent J. Carey, Douglas M. Bates, et al. <strong>Bioconductor: Open software development for computational biology and bioinformatics.</strong> <em>Genome Biology</em> 5:R80, 2004.</p>
+<p><a id="cite:QIIME"></a> J Gregory Caporaso, Justin Kuczynski, Jesse Stombaugh, Kyle Bittinger, Frederic D Bushman <strong>QIIME allows analysis of high-throughput community sequencing data.</strong> <em>Nature Methods</em> 7(5):335-336, 2010.</p>
+<p><a id="cite:Schloss:2009do"></a> P D Schloss, S L Westcott, T Ryabin, J R Hall, M Hartmann, et al. <strong>Introducing mothur: Open-Source, Platform-Independent, Community-Supported Software for Describing and Comparing Microbial Communities.</strong> <em>Applied and Environmental Microbiology</em> 75(23):7537-7541, 2009.</p>
+<p><a id="cite:RDP"></a> J R Cole, Q Wang, E Cardenas, J Fish, B Chai et al. <strong>The Ribosomal Database Project: improved alignments and new tools for rRNA analysis.</strong> <em>Nucleic Acids Research</em> 37(Database issue):D141-5, 2009.</p>
+</div>
+
+
+
+<!-- dynamically load mathjax for compatibility with self-contained -->
+<script>
+ (function () {
+ var script = document.createElement("script");
+ script.type = "text/javascript";
+ script.src = "https://cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML";
+ document.getElementsByTagName("head")[0].appendChild(script);
+ })();
+</script>
+
+</body>
+</html>
diff --git a/inst/doc/phyloseq-mixture-models.R b/inst/doc/phyloseq-mixture-models.R
new file mode 100644
index 0000000..ff73e74
--- /dev/null
+++ b/inst/doc/phyloseq-mixture-models.R
@@ -0,0 +1,74 @@
+## ----load-phyloseq, message=FALSE, warning=FALSE-------------------------
+library("phyloseq"); packageVersion("phyloseq")
+
+## ----filepath------------------------------------------------------------
+filepath = system.file("extdata", "study_1457_split_library_seqs_and_mapping.zip", package="phyloseq")
+kostic = microbio_me_qiime(filepath)
+
+## ----example-path-local, eval=FALSE--------------------------------------
+# filepath = "~/Downloads/study_1457_split_library_seqs_and_mapping.zip"
+# kostic = microbio_me_qiime(filepath)
+
+## ----example-path-remote, eval=FALSE-------------------------------------
+# kostic = microbio_me_qiime(1457)
+
+## ----show-variables------------------------------------------------------
+kostic
+head(sample_data(kostic)$DIAGNOSIS, 10)
+
+## ----deseq2, message=FALSE, warning=FALSE--------------------------------
+library("DESeq2"); packageVersion("DESeq2")
+
+## ----rm-bad-samples------------------------------------------------------
+kostic <- subset_samples(kostic, DIAGNOSIS != "None")
+kostic <- prune_samples(sample_sums(kostic) > 500, kostic)
+kostic
+
+## ----run-deseq2----------------------------------------------------------
+diagdds = phyloseq_to_deseq2(kostic, ~ DIAGNOSIS)
+# calculate geometric means prior to estimate size factors
+gm_mean = function(x, na.rm=TRUE){
+ exp(sum(log(x[x > 0]), na.rm=na.rm) / length(x))
+}
+geoMeans = apply(counts(diagdds), 1, gm_mean)
+diagdds = estimateSizeFactors(diagdds, geoMeans = geoMeans)
+diagdds = DESeq(diagdds, fitType="local")
+
+## ----grab-results-process-table------------------------------------------
+res = results(diagdds)
+res = res[order(res$padj, na.last=NA), ]
+alpha = 0.01
+sigtab = res[(res$padj < alpha), ]
+sigtab = cbind(as(sigtab, "data.frame"), as(tax_table(kostic)[rownames(sigtab), ], "matrix"))
+head(sigtab)
+
+## ----table-prelim--------------------------------------------------------
+posigtab = sigtab[sigtab[, "log2FoldChange"] > 0, ]
+posigtab = posigtab[, c("baseMean", "log2FoldChange", "lfcSE", "padj", "Phylum", "Class", "Family", "Genus")]
+
+## ----make-markdown-table, echo=FALSE, results='asis'---------------------
+# Make a markdown table
+posigtab = data.frame(OTU=rownames(posigtab), posigtab)
+cat(paste(colnames(posigtab), collapse=" | "), fill=TRUE)
+cat(paste(rep("---", times=ncol(posigtab)), collapse=" | "), fill=TRUE)
+dummy = apply(posigtab, 1, function(x){
+ cat(paste(x, collapse=" | "), fill=TRUE)
+})
+
+## ----bar-plot------------------------------------------------------------
+library("ggplot2")
+theme_set(theme_bw())
+sigtabgen = subset(sigtab, !is.na(Genus))
+# Phylum order
+x = tapply(sigtabgen$log2FoldChange, sigtabgen$Phylum, function(x) max(x))
+x = sort(x, TRUE)
+sigtabgen$Phylum = factor(as.character(sigtabgen$Phylum), levels=names(x))
+# Genus order
+x = tapply(sigtabgen$log2FoldChange, sigtabgen$Genus, function(x) max(x))
+x = sort(x, TRUE)
+sigtabgen$Genus = factor(as.character(sigtabgen$Genus), levels=names(x))
+ggplot(sigtabgen, aes(y=Genus, x=log2FoldChange, color=Phylum)) +
+ geom_vline(xintercept = 0.0, color = "gray", size = 0.5) +
+ geom_point(size=6) +
+ theme(axis.text.x = element_text(angle = -90, hjust = 0, vjust=0.5))
+
diff --git a/inst/doc/phyloseq-mixture-models.Rmd b/inst/doc/phyloseq-mixture-models.Rmd
new file mode 100644
index 0000000..3e3b03e
--- /dev/null
+++ b/inst/doc/phyloseq-mixture-models.Rmd
@@ -0,0 +1,191 @@
+---
+title: "Example using Negative Binomial in Microbiome Differential Abundance Testing"
+output:
+ BiocStyle::html_document:
+ fig_height: 7
+ fig_width: 10
+ toc: yes
+ toc_depth: 2
+ number_sections: true
+---
+<!--
+%% \VignetteEngine{knitr::rmarkdown}
+%% \VignetteIndexEntry{phyloseq and DESeq2 on Colorectal Cancer Data}
+-->
+
+`r library("knitr")`
+`r opts_chunk$set(cache=FALSE, fig.width=9, message=FALSE, warning=FALSE)`
+
+Paul J. McMurdie and Susan Holmes
+
+<mcmurdie at stanford.edu>
+
+[phyloseq Home Page](http://joey711.github.io/phyloseq/)
+
+If you find phyloseq and/or its tutorials useful, please acknowledge and cite phyloseq in your publications:
+
+**phyloseq: An R package for reproducible interactive analysis and graphics of microbiome census data** (2013) PLoS ONE 8(4):e61217
+http://dx.plos.org/10.1371/journal.pone.0061217
+
+# Other resources
+The phyloseq project also has a number of supporting online resources, most of which can be found at [the phyloseq home page](http://joey711.github.com/phyloseq/), or from the phyloseq stable release [page on Bioconductor](http://bioconductor.org/packages/release/bioc/html/phyloseq.html).
+
+To post feature requests or ask for help, try [the phyloseq Issue Tracker](https://github.com/joey711/phyloseq/issues).
+
+
+# The experimental data used in this example
+
+In this example I use the publicly available data from a study on colorectal cancer:
+
+[Genomic analysis identifies association of Fusobacterium with colorectal carcinoma](http://genome.cshlp.org/content/22/2/292.long).
+Kostic, A. D., Gevers, D., Pedamallu, C. S., Michaud, M., Duke, F., Earl, A. M., et al. (2012). *Genome research*, 22(2), 292-298.
+
+As a side-note, this work was published ahead of print in [Genome Research](http://genome.cshlp.org/) alongside a highly-related article from a separate group of researchers (long-live reproducible observations!): [Fusobacterium nucleatum infection is prevalent in human colorectal carcinoma](http://genome.cshlp.org/content/22/2/299.long). In case you are interested. For the purposes of example, however, we will stick to the data from the former study, with data available at the [microbio [...]
+
+Data source, from methods section in article:
+
+> The 16S gene data set consists of 454 FLX Titanium sequences spanning the V3 to V5 variable regions obtained for 190 samples (95 pairs). Detailed protocols used for 16S amplification and sequencing are available on the HMP Data Analysis and Coordination Center website (http://www.hmpdacc.org/tools_protocols/tools_protocols.php).
+
+Study ID: `1457`
+
+Project Name: `Kostic_colorectal_cancer_fusobacterium`
+
+Study Abstract:
+
+> The tumor microenvironment of colorectal carcinoma is a complex community of genomically altered cancer cells, nonneoplastic cells, and a diverse collection of microorganisms. Each of these components may contribute to carcinogenesis; however, the role of the microbiota is the least well understood. We have characterized the composition of the microbiota in colorectal carcinoma using whole genome sequences from nine tumor/normal pairs. Fusobacterium sequences were enriched in carcinom [...]
+
+# Import data with phyloseq, convert to DESeq2
+
+Start by loading phyloseq.
+
+```{r load-phyloseq, message=FALSE, warning=FALSE}
+library("phyloseq"); packageVersion("phyloseq")
+```
+
+Define the file path, and import the published OTU count data into R.
+
+```{r filepath}
+filepath = system.file("extdata", "study_1457_split_library_seqs_and_mapping.zip", package="phyloseq")
+kostic = microbio_me_qiime(filepath)
+```
+
+Here I had to use a relative file path so that this example works on all systems that have phyloseq installed. In practice, your file path will look like this (if you've downloaded the data ahead of time):
+
+```{r example-path-local, eval=FALSE}
+filepath = "~/Downloads/study_1457_split_library_seqs_and_mapping.zip"
+kostic = microbio_me_qiime(filepath)
+```
+
+Or like this (if you're accessing data directly from the microbio.me/qiime server):
+
+```{r example-path-remote, eval=FALSE}
+kostic = microbio_me_qiime(1457)
+```
+
+
+# Convert to DESeq2's DESeqDataSet class
+
+In this example I'm using the major sample covariate, `DIAGNOSIS`, as the study design factor. The focus of this study was to compare the microbiomes of pairs of healthy and cancerous tissues, so this makes sense. Your study could have a more complex or nested design, and you should think carefully about the study design formula, because this is critical to the test results and their meaning. You might even need to define a new factor if none of the variables in your current table approp [...]
+
+Here is the summary of the data variable `kostic` that we are about to use, as well as the first few entries of the `DIAGNOSIS` factor.
+```{r show-variables}
+kostic
+head(sample_data(kostic)$DIAGNOSIS, 10)
+```
+
+# DESeq2 conversion and call
+
+First load DESeq2.
+
+```{r deseq2, message=FALSE, warning=FALSE}
+library("DESeq2"); packageVersion("DESeq2")
+```
+
+The following two lines actually do all the complicated DESeq2 work. The function `phyloseq_to_deseq2` converts your phyloseq-format microbiome data into a `DESeqDataSet` with dispersions estimated, using the experimental design formula, also shown (the `~DIAGNOSIS` term). The `DESeq` function does the rest of the testing, in this case with default testing framework, but you can actually use alternatives.
+
+First remove the 5 samples that had no `DIAGNOSIS` attribute assigned.
+These introduce a spurious third design class
+that is actually a rare artifact in the dataset.
+Also remove samples with less than `500` reads (counts).
+Note that this kind of data cleanup
+is useful, necessary, and should be well-documented
+because it can also be dangerous to alter or omit data
+without clear documentation.
+In this case I actually explored the data first,
+and am omitting some of the details
+(and explanatory plots) here for clarity.
+
+```{r rm-bad-samples}
+kostic <- subset_samples(kostic, DIAGNOSIS != "None")
+kostic <- prune_samples(sample_sums(kostic) > 500, kostic)
+kostic
+```
+
+
+```{r run-deseq2}
+diagdds = phyloseq_to_deseq2(kostic, ~ DIAGNOSIS)
+# calculate geometric means prior to estimate size factors
+gm_mean = function(x, na.rm=TRUE){
+ exp(sum(log(x[x > 0]), na.rm=na.rm) / length(x))
+}
+geoMeans = apply(counts(diagdds), 1, gm_mean)
+diagdds = estimateSizeFactors(diagdds, geoMeans = geoMeans)
+diagdds = DESeq(diagdds, fitType="local")
+```
+Note: The default multiple-inference correction is Benjamini-Hochberg, and occurs within the `DESeq` function.
+
+
+# Investigate test results table
+
+The following `results` function call creates a table of the results of the tests. Very fast. The hard work was already stored with the rest of the DESeq2-related data in our latest version of the `diagdds` object (see above). I then order by the adjusted p-value, removing the entries with an `NA` value. The rest of this example is just formatting the results table with taxonomic information for nice(ish) display in the HTML output.
+
+```{r grab-results-process-table}
+res = results(diagdds)
+res = res[order(res$padj, na.last=NA), ]
+alpha = 0.01
+sigtab = res[(res$padj < alpha), ]
+sigtab = cbind(as(sigtab, "data.frame"), as(tax_table(kostic)[rownames(sigtab), ], "matrix"))
+head(sigtab)
+```
+
+Let's look at just the OTUs that were significantly enriched in the carcinoma tissue. First, cleaning up the table a little for legibility.
+
+```{r table-prelim}
+posigtab = sigtab[sigtab[, "log2FoldChange"] > 0, ]
+posigtab = posigtab[, c("baseMean", "log2FoldChange", "lfcSE", "padj", "Phylum", "Class", "Family", "Genus")]
+```
+```{r make-markdown-table, echo=FALSE, results='asis'}
+# Make a markdown table
+posigtab = data.frame(OTU=rownames(posigtab), posigtab)
+cat(paste(colnames(posigtab), collapse=" | "), fill=TRUE)
+cat(paste(rep("---", times=ncol(posigtab)), collapse=" | "), fill=TRUE)
+dummy = apply(posigtab, 1, function(x){
+ cat(paste(x, collapse=" | "), fill=TRUE)
+})
+```
+
+As expected from the original study abstract and title, a *Fusobacterium* OTU was among the most-significantly differentially abundant between the cancerous and healthy samples.
+
+
+# Plot Results
+
+Here is a plot of the log2 fold change for each significant OTU that has a Genus assignment, with Genus on the vertical axis and points colored by Phylum. Uses some ggplot2 commands.
+
+```{r bar-plot}
+library("ggplot2")
+theme_set(theme_bw())
+sigtabgen = subset(sigtab, !is.na(Genus))
+# Phylum order
+x = tapply(sigtabgen$log2FoldChange, sigtabgen$Phylum, function(x) max(x))
+x = sort(x, TRUE)
+sigtabgen$Phylum = factor(as.character(sigtabgen$Phylum), levels=names(x))
+# Genus order
+x = tapply(sigtabgen$log2FoldChange, sigtabgen$Genus, function(x) max(x))
+x = sort(x, TRUE)
+sigtabgen$Genus = factor(as.character(sigtabgen$Genus), levels=names(x))
+ggplot(sigtabgen, aes(y=Genus, x=log2FoldChange, color=Phylum)) +
+ geom_vline(xintercept = 0.0, color = "gray", size = 0.5) +
+ geom_point(size=6) +
+ theme(axis.text.x = element_text(angle = -90, hjust = 0, vjust=0.5))
+```
+
diff --git a/inst/doc/phyloseq-mixture-models.html b/inst/doc/phyloseq-mixture-models.html
new file mode 100644
index 0000000..7f6c623
--- /dev/null
+++ b/inst/doc/phyloseq-mixture-models.html
@@ -0,0 +1,302 @@
+<!DOCTYPE html>
+
+<html xmlns="http://www.w3.org/1999/xhtml">
+
+<head>
+
+<meta charset="utf-8">
+<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
+<meta name="generator" content="pandoc" />
+
+
+
+<title>Example using Negative Binomial in Microbiome Differential Abundance Testing</title>
+
+<link href="data:text/css;charset=utf-8,pre%20%2Eoperator%2C%0Apre%20%2Eparen%20%7B%0Acolor%3A%20rgb%28104%2C%20118%2C%20135%29%0A%7D%0Apre%20%2Eliteral%20%7B%0Acolor%3A%20%23990073%0A%7D%0Apre%20%2Enumber%20%7B%0Acolor%3A%20%23099%3B%0A%7D%0Apre%20%2Ecomment%20%7B%0Acolor%3A%20%23998%3B%0Afont%2Dstyle%3A%20italic%0A%7D%0Apre%20%2Ekeyword%20%7B%0Acolor%3A%20%23900%3B%0Afont%2Dweight%3A%20bold%0A%7D%0Apre%20%2Eidentifier%20%7B%0Acolor%3A%20rgb%280%2C%200%2C%200%29%3B%0A%7D%0Apre%20%2Estri [...]
+<script src="data:application/x-javascript;base64,dmFyIGhsanM9bmV3IGZ1bmN0aW9uKCl7ZnVuY3Rpb24gbShwKXtyZXR1cm4gcC5yZXBsYWNlKC8mL2dtLCImYW1wOyIpLnJlcGxhY2UoLzwvZ20sIiZsdDsiKX1mdW5jdGlvbiBmKHIscSxwKXtyZXR1cm4gUmVnRXhwKHEsIm0iKyhyLmNJPyJpIjoiIikrKHA/ImciOiIiKSl9ZnVuY3Rpb24gYihyKXtmb3IodmFyIHA9MDtwPHIuY2hpbGROb2Rlcy5sZW5ndGg7cCsrKXt2YXIgcT1yLmNoaWxkTm9kZXNbcF07aWYocS5ub2RlTmFtZT09IkNPREUiKXtyZXR1cm4gcX1pZighKHEubm9kZVR5cGU9PTMmJnEubm9kZVZhbHVlLm1hdGNoKC9ccysvKSkpe2JyZWFrfX19ZnVuY3Rpb24gaCh0LH [...]
+
+<style type="text/css">code{white-space: pre;}</style>
+<style type="text/css">
+ pre:not([class]) {
+ background-color: white;
+ }
+</style>
+<script type="text/javascript">
+if (window.hljs && document.readyState && document.readyState === "complete") {
+ window.setTimeout(function() {
+ hljs.initHighlighting();
+ }, 0);
+}
+</script>
+
+
+<link href="data:text/css;charset=utf-8,body%2C%20td%20%7B%0Afont%2Dfamily%3A%20sans%2Dserif%3B%0Abackground%2Dcolor%3A%20white%3B%0Afont%2Dsize%3A%2013px%3B%0A%7D%0Abody%20%7B%0Amax%2Dwidth%3A%20800px%3B%0Amargin%3A%200%20auto%3B%0Apadding%3A%201em%201em%202em%3B%0Aline%2Dheight%3A%2020px%3B%0A%7D%0A%0Adiv%23TOC%20li%20%7B%0Alist%2Dstyle%3Anone%3B%0Abackground%2Dimage%3Anone%3B%0Abackground%2Drepeat%3Anone%3B%0Abackground%2Dposition%3A0%3B%0A%7D%0A%0Ap%2C%20pre%20%7B%20margin%3A%200em%2 [...]
+
+<script type="text/javascript">
+document.addEventListener("DOMContentLoaded", function() {
+ var links = document.links;
+ for (var i = 0, linksLength = links.length; i < linksLength; i++)
+ if(links[i].hostname != window.location.hostname)
+ links[i].target = '_blank';
+});
+</script>
+
+</head>
+
+<body>
+
+
+<div id="header">
+<h1 class="title">Example using Negative Binomial in Microbiome Differential Abundance Testing</h1>
+</div>
+
+<h1>Contents</h1>
+<div id="TOC">
+<ul>
+<li><a href="#other-resources"><span class="toc-section-number">1</span> Other resources</a></li>
+<li><a href="#the-experimental-data-used-in-this-example"><span class="toc-section-number">2</span> The experimental data used in this example</a></li>
+<li><a href="#import-data-with-phyloseq-convert-to-deseq2"><span class="toc-section-number">3</span> Import data with phyloseq, convert to DESeq2</a></li>
+<li><a href="#convert-to-deseq2s-deseqdataset-class"><span class="toc-section-number">4</span> Convert to DESeq2’s DESeqDataSet class</a></li>
+<li><a href="#deseq2-conversion-and-call"><span class="toc-section-number">5</span> DESeq2 conversion and call</a></li>
+<li><a href="#investigate-test-results-table"><span class="toc-section-number">6</span> Investigate test results table</a></li>
+<li><a href="#plot-results"><span class="toc-section-number">7</span> Plot Results</a></li>
+</ul>
+</div>
+
+<!--
+%% \VignetteEngine{knitr::rmarkdown}
+%% \VignetteIndexEntry{phyloseq and DESeq2 on Colorectal Cancer Data}
+-->
+<p>Paul J. McMurdie and Susan Holmes</p>
+<p><a href="mailto:mcmurdie at stanford.edu">mcmurdie at stanford.edu</a></p>
+<p><a href="http://joey711.github.io/phyloseq/">phyloseq Home Page</a></p>
+<p>If you find phyloseq and/or its tutorials useful, please acknowledge and cite phyloseq in your publications:</p>
+<p><strong>phyloseq: An R package for reproducible interactive analysis and graphics of microbiome census data</strong> (2013) PLoS ONE 8(4):e61217 <a href="http://dx.plos.org/10.1371/journal.pone.0061217" class="uri">http://dx.plos.org/10.1371/journal.pone.0061217</a></p>
+<div id="other-resources" class="section level1">
+<h1><span class="header-section-number">1</span> Other resources</h1>
+<p>The phyloseq project also has a number of supporting online resources, most of which can be found at <a href="http://joey711.github.com/phyloseq/">the phyloseq home page</a>, or from the phyloseq stable release <a href="http://bioconductor.org/packages/release/bioc/html/phyloseq.html">page on Bioconductor</a>.</p>
+<p>To post feature requests or ask for help, try <a href="https://github.com/joey711/phyloseq/issues">the phyloseq Issue Tracker</a>.</p>
+</div>
+<div id="the-experimental-data-used-in-this-example" class="section level1">
+<h1><span class="header-section-number">2</span> The experimental data used in this example</h1>
+<p>In this example I use the publicly available data from a study on colorectal cancer:</p>
+<p><a href="http://genome.cshlp.org/content/22/2/292.long">Genomic analysis identifies association of Fusobacterium with colorectal carcinoma</a>. Kostic, A. D., Gevers, D., Pedamallu, C. S., Michaud, M., Duke, F., Earl, A. M., et al. (2012). <em>Genome research</em>, 22(2), 292-298.</p>
+<p>As a side-note, this work was published ahead of print in <a href="http://genome.cshlp.org/">Genome Research</a> alongside a highly-related article from a separate group of researchers (long-live reproducible observations!): <a href="http://genome.cshlp.org/content/22/2/299.long">Fusobacterium nucleatum infection is prevalent in human colorectal carcinoma</a>. In case you are interested. For the purposes of example, however, we will stick to the data from the former study, with data a [...]
+<p>Data source, from methods section in article:</p>
+<blockquote>
+<p>The 16S gene data set consists of 454 FLX Titanium sequences spanning the V3 to V5 variable regions obtained for 190 samples (95 pairs). Detailed protocols used for 16S amplification and sequencing are available on the HMP Data Analysis and Coordination Center website (<a href="http://www.hmpdacc.org/tools_protocols/tools_protocols.php" class="uri">http://www.hmpdacc.org/tools_protocols/tools_protocols.php</a>).</p>
+</blockquote>
+<p>Study ID: <code>1457</code></p>
+<p>Project Name: <code>Kostic_colorectal_cancer_fusobacterium</code></p>
+<p>Study Abstract:</p>
+<blockquote>
+<p>The tumor microenvironment of colorectal carcinoma is a complex community of genomically altered cancer cells, nonneoplastic cells, and a diverse collection of microorganisms. Each of these components may contribute to carcinogenesis; however, the role of the microbiota is the least well understood. We have characterized the composition of the microbiota in colorectal carcinoma using whole genome sequences from nine tumor/normal pairs. Fusobacterium sequences were enriched in carcino [...]
+</blockquote>
+</div>
+<div id="import-data-with-phyloseq-convert-to-deseq2" class="section level1">
+<h1><span class="header-section-number">3</span> Import data with phyloseq, convert to DESeq2</h1>
+<p>Start by loading phyloseq.</p>
+<pre class="r"><code>library("phyloseq"); packageVersion("phyloseq")</code></pre>
+<pre><code>## [1] '1.19.1'</code></pre>
+<p>Define the file path, and import the published OTU count data into R.</p>
+<pre class="r"><code>filepath = system.file("extdata", "study_1457_split_library_seqs_and_mapping.zip", package="phyloseq")
+kostic = microbio_me_qiime(filepath)</code></pre>
+<pre><code>## Found biom-format file, now parsing it...
+## Done parsing biom...
+## Importing Sample Metdadata from mapping file...
+## Merging the imported objects...
+## Successfully merged, phyloseq-class created.
+## Returning...</code></pre>
+<p>Here I had to use a relative file path so that this example works on all systems that have phyloseq installed. In practice, your file path will look like this (if you’ve downloaded the data ahead of time):</p>
+<pre class="r"><code>filepath = "~/Downloads/study_1457_split_library_seqs_and_mapping.zip"
+kostic = microbio_me_qiime(filepath)</code></pre>
+<p>Or like this (if you’re accessing data directly from the microbio.me/qiime server):</p>
+<pre class="r"><code>kostic = microbio_me_qiime(1457)</code></pre>
+</div>
+<div id="convert-to-deseq2s-deseqdataset-class" class="section level1">
+<h1><span class="header-section-number">4</span> Convert to DESeq2’s DESeqDataSet class</h1>
+<p>In this example I’m using the major sample covariate, <code>DIAGNOSIS</code>, as the study design factor. The focus of this study was to compare the microbiomes of pairs of healthy and cancerous tissues, so this makes sense. Your study could have a more complex or nested design, and you should think carefully about the study design formula, because this is critical to the test results and their meaning. You might even need to define a new factor if none of the variables in your curren [...]
+<p>Here is the summary of the data variable <code>kostic</code> that we are about to use, as well as the first few entries of the <code>DIAGNOSIS</code> factor.</p>
+<pre class="r"><code>kostic</code></pre>
+<pre><code>## phyloseq-class experiment-level object
+## otu_table() OTU Table: [ 2505 taxa and 190 samples ]
+## sample_data() Sample Data: [ 190 samples by 71 sample variables ]
+## tax_table() Taxonomy Table: [ 2505 taxa by 7 taxonomic ranks ]</code></pre>
+<pre class="r"><code>head(sample_data(kostic)$DIAGNOSIS, 10)</code></pre>
+<pre><code>## [1] Healthy Tumor Tumor Healthy Healthy Healthy Tumor Healthy
+## [9] Healthy Healthy
+## Levels: Healthy None Tumor</code></pre>
+</div>
+<div id="deseq2-conversion-and-call" class="section level1">
+<h1><span class="header-section-number">5</span> DESeq2 conversion and call</h1>
+<p>First load DESeq2.</p>
+<pre class="r"><code>library("DESeq2"); packageVersion("DESeq2")</code></pre>
+<pre><code>## [1] '1.14.1'</code></pre>
+<p>The following two lines actually do all the complicated DESeq2 work. The function <code>phyloseq_to_deseq2</code> converts your phyloseq-format microbiome data into a <code>DESeqDataSet</code> with dispersions estimated, using the experimental design formula, also shown (the <code>~DIAGNOSIS</code> term). The <code>DESeq</code> function does the rest of the testing, in this case with default testing framework, but you can actually use alternatives.</p>
+<p>First remove the 5 samples that had no <code>DIAGNOSIS</code> attribute assigned. These introduce a spurious third design class that is actually a rare artifact in the dataset. Also remove samples with less than <code>500</code> reads (counts). Note that this kind of data cleanup is useful, necessary, and should be well-documented because it can also be dangerous to alter or omit data without clear documentation. In this case I actually explored the data first, and am omitting some of [...]
+<pre class="r"><code>kostic <- subset_samples(kostic, DIAGNOSIS != "None")
+kostic <- prune_samples(sample_sums(kostic) > 500, kostic)
+kostic</code></pre>
+<pre><code>## phyloseq-class experiment-level object
+## otu_table() OTU Table: [ 2505 taxa and 177 samples ]
+## sample_data() Sample Data: [ 177 samples by 71 sample variables ]
+## tax_table() Taxonomy Table: [ 2505 taxa by 7 taxonomic ranks ]</code></pre>
+<pre class="r"><code>diagdds = phyloseq_to_deseq2(kostic, ~ DIAGNOSIS)
+# calculate geometric means prior to estimate size factors
+gm_mean = function(x, na.rm=TRUE){
+ exp(sum(log(x[x > 0]), na.rm=na.rm) / length(x))
+}
+geoMeans = apply(counts(diagdds), 1, gm_mean)
+diagdds = estimateSizeFactors(diagdds, geoMeans = geoMeans)
+diagdds = DESeq(diagdds, fitType="local")</code></pre>
+<p>Note: The default multiple-inference correction is Benjamini-Hochberg, and occurs within the <code>DESeq</code> function.</p>
+</div>
+<div id="investigate-test-results-table" class="section level1">
+<h1><span class="header-section-number">6</span> Investigate test results table</h1>
+<p>The following <code>results</code> function call creates a table of the results of the tests. Very fast. The hard work was already stored with the rest of the DESeq2-related data in our latest version of the <code>diagdds</code> object (see above). I then order by the adjusted p-value, removing the entries with an <code>NA</code> value. The rest of this example is just formatting the results table with taxonomic information for nice(ish) display in the HTML output.</p>
+<pre class="r"><code>res = results(diagdds)
+res = res[order(res$padj, na.last=NA), ]
+alpha = 0.01
+sigtab = res[(res$padj < alpha), ]
+sigtab = cbind(as(sigtab, "data.frame"), as(tax_table(kostic)[rownames(sigtab), ], "matrix"))
+head(sigtab)</code></pre>
+<pre><code>## baseMean log2FoldChange lfcSE stat pvalue
+## 64396 123.230731 2.095439 0.4087753 5.126137 2.957468e-07
+## 72853 19.433725 -1.560522 0.3212705 -4.857346 1.189696e-06
+## 180285 114.266166 -1.180600 0.2724565 -4.333171 1.469771e-05
+## 194648 6.191726 -1.237142 0.3010604 -4.109280 3.968938e-05
+## 184729 9.837951 -1.788234 0.4280126 -4.177994 2.940913e-05
+## 374052 42.072484 2.039744 0.4978723 4.096923 4.186780e-05
+## padj Kingdom Phylum Class
+## 64396 0.0001076518 Bacteria Fusobacteria Fusobacteria (class)
+## 72853 0.0002165246 Bacteria Firmicutes Clostridia
+## 180285 0.0017833223 Bacteria Firmicutes Clostridia
+## 194648 0.0025399798 Bacteria Firmicutes Clostridia
+## 184729 0.0025399798 Bacteria Firmicutes Clostridia
+## 374052 0.0025399798 Bacteria Fusobacteria Fusobacteria (class)
+## Order Family Genus
+## 64396 Fusobacteriales Fusobacteriaceae Fusobacterium
+## 72853 Clostridiales Ruminococcaceae Faecalibacterium
+## 180285 Clostridiales Ruminococcaceae Faecalibacterium
+## 194648 Clostridiales <NA> <NA>
+## 184729 Clostridiales Lachnospiraceae Ruminococcus
+## 374052 Fusobacteriales Fusobacteriaceae Fusobacterium
+## Species
+## 64396 <NA>
+## 72853 <NA>
+## 180285 <NA>
+## 194648 <NA>
+## 184729 Ruminococcus gnavus
+## 374052 <NA></code></pre>
+<p>Let’s look at just the OTUs that were significantly enriched in the carcinoma tissue. First, cleaning up the table a little for legibility.</p>
+<pre class="r"><code>posigtab = sigtab[sigtab[, "log2FoldChange"] > 0, ]
+posigtab = posigtab[, c("baseMean", "log2FoldChange", "lfcSE", "padj", "Phylum", "Class", "Family", "Genus")]</code></pre>
+<table style="width:50%;">
+<colgroup>
+<col width="5%"></col>
+<col width="5%"></col>
+<col width="5%"></col>
+<col width="5%"></col>
+<col width="5%"></col>
+<col width="5%"></col>
+<col width="5%"></col>
+<col width="5%"></col>
+<col width="5%"></col>
+</colgroup>
+<thead>
+<tr class="header">
+<th>OTU</th>
+<th>baseMean</th>
+<th>log2FoldChange</th>
+<th>lfcSE</th>
+<th>padj</th>
+<th>Phylum</th>
+<th>Class</th>
+<th>Family</th>
+<th>Genus</th>
+</tr>
+</thead>
+<tbody>
+<tr class="odd">
+<td>64396</td>
+<td>123.230731</td>
+<td>2.095439</td>
+<td>0.4087753</td>
+<td>0.0001076518</td>
+<td>Fusobacteria</td>
+<td>Fusobacteria (class)</td>
+<td>Fusobacteriaceae</td>
+<td>Fusobacterium</td>
+</tr>
+<tr class="even">
+<td>374052</td>
+<td>42.072484</td>
+<td>2.039744</td>
+<td>0.4978723</td>
+<td>0.0025399798</td>
+<td>Fusobacteria</td>
+<td>Fusobacteria (class)</td>
+<td>Fusobacteriaceae</td>
+<td>Fusobacterium</td>
+</tr>
+<tr class="odd">
+<td>307981</td>
+<td>1.910564</td>
+<td>1.773142</td>
+<td>0.4492574</td>
+<td>0.0030523490</td>
+<td>Proteobacteria</td>
+<td>Gammaproteobacteria</td>
+<td>Enterobacteriaceae</td>
+<td>Klebsiella</td>
+</tr>
+<tr class="even">
+<td>9778</td>
+<td>1.248705</td>
+<td>1.850822</td>
+<td>0.5158122</td>
+<td>0.0071302629</td>
+<td>Proteobacteria</td>
+<td>Gammaproteobacteria</td>
+<td>Enterobacteriaceae</td>
+<td>Salmonella</td>
+</tr>
+</tbody>
+</table>
+<p>As expected from the original study abstract and title, a <em>Fusobacterium</em> OTU was among the most-significantly differentially abundant between the cancerous and healthy samples.</p>
+</div>
+<div id="plot-results" class="section level1">
+<h1><span class="header-section-number">7</span> Plot Results</h1>
+<p>Here is a plot of the log2 fold change for each significant OTU that has a Genus assignment, with Genus on the vertical axis and points colored by Phylum. Uses some ggplot2 commands.</p>
+<pre class="r"><code>library("ggplot2")
+theme_set(theme_bw())
+sigtabgen = subset(sigtab, !is.na(Genus))
+# Phylum order
+x = tapply(sigtabgen$log2FoldChange, sigtabgen$Phylum, function(x) max(x))
+x = sort(x, TRUE)
+sigtabgen$Phylum = factor(as.character(sigtabgen$Phylum), levels=names(x))
+# Genus order
+x = tapply(sigtabgen$log2FoldChange, sigtabgen$Genus, function(x) max(x))
+x = sort(x, TRUE)
+sigtabgen$Genus = factor(as.character(sigtabgen$Genus), levels=names(x))
+ggplot(sigtabgen, aes(y=Genus, x=log2FoldChange, color=Phylum)) +
+ geom_vline(xintercept = 0.0, color = "gray", size = 0.5) +
+ geom_point(size=6) +
+ theme(axis.text.x = element_text(angle = -90, hjust = 0, vjust=0.5))</code></pre>
+<p><img src="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAA2AAAAKgCAIAAAByb4nzAAAACXBIWXMAAA7DAAAOwwHHb6hkAAAgAElEQVR4nOzdeUATZ94H8CckAUJCAlUQUJFDRQXFi33rwYpFsIq06qp4tdpVt6Ju0Vbd993a1gNrLYqgbqtW2noVj1ZbwVpatSLerFqEolwKKgoKJOFKyDHz/jFdlgEEEiFh4Pv5K06e55lfxjD5Zp6ZCY+maQIAAAAA8B8W5i4AAAAAANoXBEQAAAAAYEFABAAAAAAWBEQAAAAAYEFABAAAAAAWBEQAAAAAYBGYu4DOa/v27Y8fPzZ3FY3T6XQ8Ho/P55u7EGPo9XqOVk7TNFM8j8czdy3G4O6WV6lUGo1GJpOZuxBj0DRN07SFBSe/7WNXYxZm3NX87W9/8/DwMPFKwTgIiGaTl5cXGxtr7ioap1AoBAKBRCIxdyHGq [...]
+</div>
+
+
+
+<!-- dynamically load mathjax for compatibility with self-contained -->
+<script>
+ (function () {
+ var script = document.createElement("script");
+ script.type = "text/javascript";
+ script.src = "https://cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML";
+ document.getElementsByTagName("head")[0].appendChild(script);
+ })();
+</script>
+
+</body>
+</html>
diff --git a/inst/extdata/GP_otu_table_rand_short.txt.gz b/inst/extdata/GP_otu_table_rand_short.txt.gz
new file mode 100644
index 0000000..f93f378
Binary files /dev/null and b/inst/extdata/GP_otu_table_rand_short.txt.gz differ
diff --git a/inst/extdata/GP_tree_rand_short.newick.gz b/inst/extdata/GP_tree_rand_short.newick.gz
new file mode 100644
index 0000000..36286d0
Binary files /dev/null and b/inst/extdata/GP_tree_rand_short.newick.gz differ
diff --git a/inst/extdata/biom-refseq.fasta b/inst/extdata/biom-refseq.fasta
new file mode 100644
index 0000000..742ee1e
--- /dev/null
+++ b/inst/extdata/biom-refseq.fasta
@@ -0,0 +1,29 @@
+>GG_OTU_1
+AACGTAGGTCACAAGCGTTGTCCGGAATTACTGGGTGTAAAGGGAGCGCAGGCGGGAAGACAAGTTGGAAGTGAAATCCA
+TGGGCTCAACCCATGAACTGCTTTCAAAACTGTTTTTCTTGAGTAGTGCAGAGGTAGGCGGAATTCCCCGTGTAGCGGTG
+AAATGCGTAGAGATGGGGAGGAACACCAGTGGCGAAGGCGGCCTGCTGGGCTTTAACTGACGCTGAGGCACGAAAGCGTG
+GGTAGCAAACAGGATTAGATACCCTGGTAGTCCACGCTGTAAACGATGATTACTAGGTGTGGGGGTCTGACCCCTTCCGT
+GCCGGAGTTAACAC
+>GG_OTU_2
+TACGTAGGGAGCAAGCGTTATCCGGATTTATTGGGTGTAAAGGGTGCGTAGACGGGAGAACAAGTTAGTTGTGAAAGCCC
+TCGGCTTAACTGAGGAACTGCAACTAAAACTATTTTTCTTGAGTGCAGGAGAGGAAAGCGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCTTTCTGGACTGTAACTGACGTTGAGGCACGAAAGTGTG
+GGGAGCAAACAGGATTAGATACCCTGGTAGTCCACACCGTAAACGATGGATACTAGGTGTAGGAGATGATTTCATCATCT
+GTGCCGAAAGCAAACGCAATAAGTATCCCACCTGGGGAGTACGGCCGCAAGGTTGAAACTCAAAGGATTGACGGGGCCCG
+CACAAGCAGTGGAGTATGTGGTTTAATTCGAAGCAACGCGAAGAACCTTACCAGGGCTTGACATA
+>GG_OTU_3
+TACGTAGGGGGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGTAGACGGCATCACAAGTCAGAAGTGAAAAATC
+CGGGGGCTCCAACCCCGGAACTGCTTTTGAAACTGTGGAGCTGGAGTGCAGGAGAGGTAAGCGGAATTCCTAGTGTAGCG
+GTAGAAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGCTTACTGGACTGTAACTGACGTTGAGGCTCGAAAGC
+GTGGGGAGC
+>GG_OTU_4
+TACGTATGGTGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGCAGGCGGTGCGGCAAGTCTGATGTGAAAGCCC
+GGGGCTCAACCCCGGTACTGCATTGGAAACTGTCGTACTAGAGTGTCGGAGGGGTAAGCGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCTTACTGGACGATAACTGACGCTGAGGCTCGAAAGCGTG
+GGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGAATACTAGGTGTTGGGAAGCATTGCTTCTCGGT
+GCCGTCGCAAACGCAGTAAGTATTCCACCTGGGGGATACGTTTCGACAAGAATAGAAACTACAAAAGGAATTAGGACGGG
+GACCCGCACAAGCGGTGAGCATGTGGTTAATCGAAGCAACGCGAAGAACCTTA
+>GG_OTU_5
+AACGTAGGGTGCAAGCGTTGTCCGGAATTACTGGGTGTAAAGGGAGCGCAGGCGGGAGACAAGTTGGAAGTGAAACCATG
+GGCTCAACCCATGAATTGCTTTCAAAACTGTTTTTCTTGAGTTAGTGCAGAGGTAGATGGAATTCCCGGTGTAGCGGTGG
+AATGCGTAGATATCGGGA
diff --git a/inst/extdata/biom-tree.phy b/inst/extdata/biom-tree.phy
new file mode 100644
index 0000000..7a5c6b5
--- /dev/null
+++ b/inst/extdata/biom-tree.phy
@@ -0,0 +1 @@
+(((GG_OTU_1:0.00892,GG_OTU_2:0.01408)1.000.2:0.12196,GG_OTU_3:0.16022)0.995.2:0.01869,(GG_OTU_4:0.08976,GG_OTU_5:0.0665)0.766:0.09714)0.764.3;
diff --git a/inst/extdata/esophagus.fn.list.gz b/inst/extdata/esophagus.fn.list.gz
new file mode 100644
index 0000000..644e49b
Binary files /dev/null and b/inst/extdata/esophagus.fn.list.gz differ
diff --git a/inst/extdata/esophagus.fn.shared.gz b/inst/extdata/esophagus.fn.shared.gz
new file mode 100644
index 0000000..8b0a38b
Binary files /dev/null and b/inst/extdata/esophagus.fn.shared.gz differ
diff --git a/inst/extdata/esophagus.good.groups.gz b/inst/extdata/esophagus.good.groups.gz
new file mode 100644
index 0000000..3c8f6b9
Binary files /dev/null and b/inst/extdata/esophagus.good.groups.gz differ
diff --git a/inst/extdata/esophagus.tree.gz b/inst/extdata/esophagus.tree.gz
new file mode 100644
index 0000000..6766f43
Binary files /dev/null and b/inst/extdata/esophagus.tree.gz differ
diff --git a/inst/extdata/gg13-5-73.tree.gz b/inst/extdata/gg13-5-73.tree.gz
new file mode 100644
index 0000000..5a2cb37
Binary files /dev/null and b/inst/extdata/gg13-5-73.tree.gz differ
diff --git a/inst/extdata/gp500-pycogent.py b/inst/extdata/gp500-pycogent.py
new file mode 100644
index 0000000..abf9ce5
--- /dev/null
+++ b/inst/extdata/gp500-pycogent.py
@@ -0,0 +1,19 @@
+# python/pycogent code to create reference UniFrac results for unit testing.
+# Obviously, assumes that pycogent has generated correct values as well.
+#
+from cogent.maths.unifrac.fast_unifrac import fast_unifrac_file
+import numpy
+envs_in = open("inst/extdata/gp500test.env.txt")
+tree_in = open("inst/extdata/gp500test.tree")
+res_uuf = fast_unifrac_file(tree_in, envs_in, weighted=False)
+numpy.savetxt("inst/extdata/gp500-uuf.csv", res_uuf['distance_matrix'][0], delimiter=",")
+# Now wuf-unnormalized
+envs_in = open("inst/extdata/gp500test.env.txt")
+tree_in = open("inst/extdata/gp500test.tree")
+res_wufu = fast_unifrac_file(tree_in, envs_in, weighted=True)
+numpy.savetxt("inst/extdata/gp500-wufu.csv", res_wufu['distance_matrix'][0], delimiter=",")
+# Now wuf (normalized... weighted='correct')
+envs_in = open("inst/extdata/gp500test.env.txt")
+tree_in = open("inst/extdata/gp500test.tree")
+res_wuf = fast_unifrac_file(tree_in, envs_in, weighted='correct')
+numpy.savetxt("inst/extdata/gp500-wuf.csv", res_wuf['distance_matrix'][0], delimiter=",")
\ No newline at end of file
diff --git a/inst/extdata/gp500-uuf.csv b/inst/extdata/gp500-uuf.csv
new file mode 100644
index 0000000..90df677
--- /dev/null
+++ b/inst/extdata/gp500-uuf.csv
@@ -0,0 +1,28 @@
+0.000000000000000000e+00,3.018022459640413713e-01,2.812222992369496355e-01,5.128077007018385647e-01,5.385307194161441657e-01,4.874848720759568099e-01,5.606402519023876430e-01,6.035774477505881919e-01,7.546108748184404691e-01,7.225915707538570487e-01,5.973862572794810255e-01,4.428912687165728057e-01,6.912146243998446016e-01,6.245264342879368602e-01,6.431979313583345848e-01,7.308343614007770661e-01,6.688046492526460440e-01,5.787613772113072708e-01,5.422984841314639759e-01,5.621884261966275 [...]
+3.018022459640413713e-01,0.000000000000000000e+00,2.037846336573840844e-01,5.099683162114644341e-01,5.215500997023562180e-01,5.094701252010024328e-01,5.904464795395936161e-01,6.460217446603697677e-01,7.650595752867355159e-01,7.424889595989494495e-01,6.299867866860129340e-01,4.628273610605441180e-01,7.221485067953961057e-01,6.576565638203557551e-01,6.736229797743815695e-01,7.731373281262485797e-01,7.005237019998449277e-01,6.133697980764845070e-01,5.417545081149834907e-01,5.946894595643467 [...]
+2.812222992369496355e-01,2.037846336573840844e-01,0.000000000000000000e+00,5.064036149898141215e-01,5.117616996238992799e-01,5.151438339386317233e-01,5.731372196379908024e-01,6.561452094867907814e-01,7.822730280374237122e-01,7.367249524439782871e-01,6.104347538541601281e-01,4.497136530060616977e-01,7.170777898704165931e-01,6.480145030860358091e-01,6.671297884686225865e-01,7.743387511316272009e-01,6.895493899435853891e-01,5.896775570603023375e-01,5.138210085249694803e-01,5.746549305869307 [...]
+5.128077007018385647e-01,5.099683162114644341e-01,5.064036149898141215e-01,0.000000000000000000e+00,2.928710261448910224e-01,5.587421714379841031e-01,6.078134485494102179e-01,6.665654159539147194e-01,7.706727085746221118e-01,7.266259854507204352e-01,7.135276553109519693e-01,6.357213367252585456e-01,7.458700772422188763e-01,5.643452901838492686e-01,7.299292238055559157e-01,7.653110179279608571e-01,6.798801683585042888e-01,6.650084385003650622e-01,7.128019550101982738e-01,6.902345340349578 [...]
+5.385307194161441657e-01,5.215500997023562180e-01,5.117616996238992799e-01,2.928710261448910224e-01,0.000000000000000000e+00,5.571637464715962773e-01,5.859088716216909187e-01,6.511411286623238759e-01,7.386450091435147192e-01,7.457494849834764850e-01,6.993507648206084282e-01,6.270365071923081901e-01,7.171394423473174129e-01,6.043817251105616517e-01,7.148191079480974430e-01,7.179598240326570124e-01,6.756010665821584604e-01,6.460851751900309470e-01,6.915038956917816115e-01,6.674808384892415 [...]
+4.874848720759568099e-01,5.094701252010024328e-01,5.151438339386317233e-01,5.587421714379841031e-01,5.571637464715962773e-01,0.000000000000000000e+00,4.477381168766813024e-01,4.903323579582479219e-01,6.733965141015454403e-01,7.083216094838546972e-01,5.689033220777302002e-01,5.424619655114654160e-01,5.846580138438358665e-01,6.420845257286216246e-01,6.246828808419919721e-01,6.469357646377038584e-01,6.399617563938526654e-01,5.077073416906403569e-01,5.798385177843332627e-01,5.461111011533451 [...]
+5.606402519023876430e-01,5.904464795395936161e-01,5.731372196379908024e-01,6.078134485494102179e-01,5.859088716216909187e-01,4.477381168766813024e-01,0.000000000000000000e+00,4.611183623382416608e-01,6.660044070172856934e-01,6.528671722812924294e-01,5.443323231070327228e-01,5.574992859671508150e-01,5.887555939533224425e-01,6.280837269780357168e-01,6.090481307157904478e-01,5.964774360009699183e-01,6.106685942254945765e-01,5.450922166353882048e-01,6.382588589717945737e-01,6.122422237742188 [...]
+6.035774477505881919e-01,6.460217446603697677e-01,6.561452094867907814e-01,6.665654159539147194e-01,6.511411286623238759e-01,4.903323579582479219e-01,4.611183623382416608e-01,0.000000000000000000e+00,6.255802914925525648e-01,6.308776386950452064e-01,5.719848486039182944e-01,5.801414228524433003e-01,5.675063641584306984e-01,6.068595771089477786e-01,6.060832286243188483e-01,6.119834623998299250e-01,5.943089707315152292e-01,5.629225086243283549e-01,6.653926944487023043e-01,5.871559853329260 [...]
+7.546108748184404691e-01,7.650595752867355159e-01,7.822730280374237122e-01,7.706727085746221118e-01,7.386450091435147192e-01,6.733965141015454403e-01,6.660044070172856934e-01,6.255802914925525648e-01,0.000000000000000000e+00,7.096835674362145241e-01,6.420637565590719964e-01,7.150353570135556946e-01,5.682094663783543442e-01,7.085286388977660721e-01,6.117100142703253418e-01,5.752129309117588463e-01,6.500626733403602753e-01,6.202943148383497363e-01,7.263621543926573887e-01,7.070148717250914 [...]
+7.225915707538570487e-01,7.424889595989494495e-01,7.367249524439782871e-01,7.266259854507204352e-01,7.457494849834764850e-01,7.083216094838546972e-01,6.528671722812924294e-01,6.308776386950452064e-01,7.096835674362145241e-01,0.000000000000000000e+00,5.447606875003407900e-01,6.638157071841106482e-01,6.847319347319347216e-01,4.966051044466344155e-01,6.451634493745304066e-01,6.600895443176622024e-01,4.406050445260767390e-01,6.253881867165617114e-01,7.198133755092888109e-01,6.904956262227578 [...]
+5.973862572794810255e-01,6.299867866860129340e-01,6.104347538541601281e-01,7.135276553109519693e-01,6.993507648206084282e-01,5.689033220777302002e-01,5.443323231070327228e-01,5.719848486039182944e-01,6.420637565590719964e-01,5.447606875003407900e-01,0.000000000000000000e+00,5.139193697167032671e-01,5.540855820173551383e-01,5.952592776743872438e-01,5.072554969557034887e-01,6.290262196827972607e-01,5.821029862230160390e-01,3.974041084363749698e-01,5.665432190671135793e-01,5.987830348414591 [...]
+4.428912687165728057e-01,4.628273610605441180e-01,4.497136530060616977e-01,6.357213367252585456e-01,6.270365071923081901e-01,5.424619655114654160e-01,5.574992859671508150e-01,5.801414228524433003e-01,7.150353570135556946e-01,6.638157071841106482e-01,5.139193697167032671e-01,0.000000000000000000e+00,6.495147321848812050e-01,6.055435382572758041e-01,5.380306547321428434e-01,7.231046110441912145e-01,6.385415997255674547e-01,5.209031830591230872e-01,5.281234009053332734e-01,5.348262192147712 [...]
+6.912146243998446016e-01,7.221485067953961057e-01,7.170777898704165931e-01,7.458700772422188763e-01,7.171394423473174129e-01,5.846580138438358665e-01,5.887555939533224425e-01,5.675063641584306984e-01,5.682094663783543442e-01,6.847319347319347216e-01,5.540855820173551383e-01,6.495147321848812050e-01,0.000000000000000000e+00,6.325868768631122130e-01,6.056048848430581888e-01,4.700586697392283853e-01,6.409776110324213461e-01,5.259173337911342561e-01,6.916055963743695800e-01,6.788948593243997 [...]
+6.245264342879368602e-01,6.576565638203557551e-01,6.480145030860358091e-01,5.643452901838492686e-01,6.043817251105616517e-01,6.420845257286216246e-01,6.280837269780357168e-01,6.068595771089477786e-01,7.085286388977660721e-01,4.966051044466344155e-01,5.952592776743872438e-01,6.055435382572758041e-01,6.325868768631122130e-01,0.000000000000000000e+00,5.977587611845891491e-01,6.987141259258766723e-01,4.797465379782177575e-01,5.404128981746507687e-01,7.158806317923904761e-01,6.571136899751853 [...]
+6.431979313583345848e-01,6.736229797743815695e-01,6.671297884686225865e-01,7.299292238055559157e-01,7.148191079480974430e-01,6.246828808419919721e-01,6.090481307157904478e-01,6.060832286243188483e-01,6.117100142703253418e-01,6.451634493745304066e-01,5.072554969557034887e-01,5.380306547321428434e-01,6.056048848430581888e-01,5.977587611845891491e-01,0.000000000000000000e+00,6.871584555856489196e-01,6.462002215753450685e-01,4.283538153956676364e-01,5.670629469363366315e-01,6.120707128042669 [...]
+7.308343614007770661e-01,7.731373281262485797e-01,7.743387511316272009e-01,7.653110179279608571e-01,7.179598240326570124e-01,6.469357646377038584e-01,5.964774360009699183e-01,6.119834623998299250e-01,5.752129309117588463e-01,6.600895443176622024e-01,6.290262196827972607e-01,7.231046110441912145e-01,4.700586697392283853e-01,6.987141259258766723e-01,6.871584555856489196e-01,0.000000000000000000e+00,6.699374256535196581e-01,6.085549726968952911e-01,7.649368375804368458e-01,7.682721961077182 [...]
+6.688046492526460440e-01,7.005237019998449277e-01,6.895493899435853891e-01,6.798801683585042888e-01,6.756010665821584604e-01,6.399617563938526654e-01,6.106685942254945765e-01,5.943089707315152292e-01,6.500626733403602753e-01,4.406050445260767390e-01,5.821029862230160390e-01,6.385415997255674547e-01,6.409776110324213461e-01,4.797465379782177575e-01,6.462002215753450685e-01,6.699374256535196581e-01,0.000000000000000000e+00,5.589858681301513865e-01,7.425304626801023655e-01,6.459568402632434 [...]
+5.787613772113072708e-01,6.133697980764845070e-01,5.896775570603023375e-01,6.650084385003650622e-01,6.460851751900309470e-01,5.077073416906403569e-01,5.450922166353882048e-01,5.629225086243283549e-01,6.202943148383497363e-01,6.253881867165617114e-01,3.974041084363749698e-01,5.209031830591230872e-01,5.259173337911342561e-01,5.404128981746507687e-01,4.283538153956676364e-01,6.085549726968952911e-01,5.589858681301513865e-01,0.000000000000000000e+00,5.513306634178093413e-01,5.559491445130610 [...]
+5.422984841314639759e-01,5.417545081149834907e-01,5.138210085249694803e-01,7.128019550101982738e-01,6.915038956917816115e-01,5.798385177843332627e-01,6.382588589717945737e-01,6.653926944487023043e-01,7.263621543926573887e-01,7.198133755092888109e-01,5.665432190671135793e-01,5.281234009053332734e-01,6.916055963743695800e-01,7.158806317923904761e-01,5.670629469363366315e-01,7.649368375804368458e-01,7.425304626801023655e-01,5.513306634178093413e-01,0.000000000000000000e+00,5.124281833514244 [...]
+5.621884261966275664e-01,5.946894595643467119e-01,5.746549305869307345e-01,6.902345340349578251e-01,6.674808384892415569e-01,5.461111011533451576e-01,6.122422237742188855e-01,5.871559853329260115e-01,7.070148717250914761e-01,6.904956262227578412e-01,5.987830348414591164e-01,5.348262192147712479e-01,6.788948593243997465e-01,6.571136899751853777e-01,6.120707128042669476e-01,7.682721961077182415e-01,6.459568402632434037e-01,5.559491445130610998e-01,5.124281833514244822e-01,0.000000000000000 [...]
+5.633702112168907927e-01,5.949918345127924724e-01,5.898596173919404828e-01,7.251749314770090660e-01,7.211957564350403027e-01,5.308861702861047416e-01,6.063635869914036913e-01,6.287356542490899480e-01,7.019743096075210520e-01,6.798977385391600237e-01,5.402795293374822450e-01,5.463955321578519708e-01,6.702262206568273539e-01,6.698214760284297853e-01,5.808331725931986655e-01,7.393954843836234581e-01,6.653498282084823190e-01,5.133944092579348917e-01,4.450888946757888265e-01,4.121138191195026 [...]
+5.120521481597772873e-01,5.568260690232531562e-01,5.466762275329436482e-01,6.143019545530881143e-01,6.640179992854537971e-01,6.047976742687384277e-01,6.159861453341926030e-01,6.239124645447141626e-01,7.229661798668907124e-01,6.845506923645165998e-01,5.941245214587950185e-01,4.756323437068479265e-01,7.008768483822873652e-01,6.414383834702079845e-01,5.885352563513785995e-01,7.509528227131285760e-01,6.680610511091379378e-01,5.789931973883517546e-01,5.101244671405643638e-01,5.760477225907113 [...]
+6.004525725960624394e-01,5.745018338822702741e-01,5.942328743314939654e-01,3.718703930306079819e-01,4.184782938999260793e-01,5.871109088407440169e-01,5.867189246124799862e-01,6.039747282582453858e-01,7.259601416722224476e-01,6.755007084084423408e-01,7.093689175176294270e-01,6.540465785494269380e-01,7.324845661797745322e-01,5.307146107583130545e-01,7.152490638881390250e-01,7.211040435800759596e-01,6.528743801726650808e-01,6.536238480052116717e-01,7.325867421237337584e-01,7.065232397524694 [...]
+6.107633679272901350e-01,6.126280874035687773e-01,6.069880680870309853e-01,6.975711232661372385e-01,6.940563495947686956e-01,6.012427141243499307e-01,5.934711376119754522e-01,5.889534591014027320e-01,7.236408840697805545e-01,6.812450591186096638e-01,6.378197984000788523e-01,5.830712002399072169e-01,7.070182689183988067e-01,6.634579849478804414e-01,6.434692081951172149e-01,7.479942789675259052e-01,6.792266887464106784e-01,5.942975544530375576e-01,5.741601910354794391e-01,4.845546786922204 [...]
+5.673274165609600850e-01,5.473635573841704183e-01,5.489407052547168320e-01,6.447489428963317781e-01,6.192299658845645816e-01,6.189566633727665579e-01,6.213969612896456107e-01,6.355682835061594593e-01,7.460359736653798812e-01,6.844061201071454104e-01,6.560471372468050788e-01,5.781855690154474869e-01,7.097014167331552326e-01,6.286805985560283139e-01,6.810433102386653781e-01,7.526870175609831071e-01,6.642459422607140951e-01,6.319732315899984076e-01,5.936147494525365609e-01,5.211317589502831 [...]
+5.509755671071145944e-01,5.489655823635466447e-01,5.434588429914726238e-01,6.327924853195081578e-01,6.150206162650020403e-01,5.850240755640849777e-01,5.921751006841329268e-01,6.262284414373728847e-01,7.333768470739665535e-01,6.876186850591183841e-01,6.397689378080396727e-01,5.841054482474659082e-01,6.584994028146794465e-01,6.265252256475604131e-01,6.661084245113000346e-01,7.068914638444390164e-01,6.473251778882369578e-01,5.781561226945253207e-01,6.108131487100765256e-01,5.601465716826863 [...]
+6.850860824014650685e-01,7.207098567737264538e-01,7.160624564713002904e-01,7.608539544978912827e-01,7.305931887550538573e-01,5.731430739019662823e-01,5.187687825468555447e-01,5.189319801426688361e-01,6.157001087159788000e-01,6.784103521011575211e-01,5.760786681452225722e-01,6.613964488553241150e-01,4.855001881151509435e-01,6.735027278723462185e-01,5.880498322951603907e-01,5.105547717844965971e-01,6.555357956466144609e-01,5.641049492916649921e-01,6.889162431243973650e-01,6.520069250025337 [...]
+6.351633762461322830e-01,7.124885415429300650e-01,7.013103493535752797e-01,7.304280369517197613e-01,7.098966652240308317e-01,6.011448776749324097e-01,5.637069332077186967e-01,5.190680923014974191e-01,6.206160518562786610e-01,6.370931879247478946e-01,5.687360477303478046e-01,6.318322197115266992e-01,5.785992610433664529e-01,6.622490212615965710e-01,6.335834745420800829e-01,5.421020445288501355e-01,5.956873441881176401e-01,5.395876074388995525e-01,6.557962776312856734e-01,6.244069090112995 [...]
diff --git a/inst/extdata/gp500-wuf.csv b/inst/extdata/gp500-wuf.csv
new file mode 100644
index 0000000..6231b4a
--- /dev/null
+++ b/inst/extdata/gp500-wuf.csv
@@ -0,0 +1,28 @@
+0.000000000000000000e+00,2.162223989832920135e-01,2.116262092597404609e-01,3.866234256280975390e-01,4.121144120842151759e-01,5.089512413156394022e-01,5.194198507399426790e-01,5.193823887379401816e-01,5.561588477743008019e-01,4.163512597753724065e-01,4.306243018448085436e-01,4.021764597077106540e-01,5.784421980040588274e-01,3.894690240514162016e-01,3.596343047726121567e-01,5.957241700716232602e-01,3.948106373579565731e-01,4.299216154496899867e-01,4.756344809638139437e-01,5.059288076680952 [...]
+2.162223989832920135e-01,0.000000000000000000e+00,6.033516257592930893e-02,3.919813536632322237e-01,4.118317947799772782e-01,4.686157344859071539e-01,4.748457331330334097e-01,5.236635025580955727e-01,5.587277843297561164e-01,4.493556863578908600e-01,4.624274030387743317e-01,4.733261369965310883e-01,5.908681961098961599e-01,4.260558983038921221e-01,4.346942546599214530e-01,6.084765378064718178e-01,4.068332975756510650e-01,4.620069849083962010e-01,4.994679882027984741e-01,5.304523728237284 [...]
+2.116262092597404609e-01,6.033516257592930893e-02,0.000000000000000000e+00,3.846952581247909952e-01,4.098351090292874921e-01,4.671432657029127022e-01,4.731440849186817066e-01,5.242977698645472362e-01,5.606719165234154145e-01,4.278523166862108762e-01,4.370683810650250467e-01,4.282644509393823129e-01,5.917902155132885866e-01,4.054299366205006416e-01,3.946725236888753985e-01,6.092712326388293187e-01,3.836311211344253791e-01,4.373693824095087335e-01,5.009002366112027538e-01,5.318469579383493 [...]
+3.866234256280975390e-01,3.919813536632322237e-01,3.846952581247909952e-01,0.000000000000000000e+00,2.133426265490963036e-01,5.507677110080698712e-01,5.571948013872931593e-01,5.495809397183640810e-01,6.004474078601035325e-01,3.503243491678726396e-01,4.529903043932035911e-01,4.693561709058894360e-01,6.107611565993079639e-01,3.453991354905851519e-01,4.432350656829630031e-01,6.249230459925192438e-01,4.317497180552467007e-01,4.498975543118769460e-01,4.708637784029361395e-01,4.937192405136009 [...]
+4.121144120842151759e-01,4.118317947799772782e-01,4.098351090292874921e-01,2.133426265490963036e-01,0.000000000000000000e+00,5.537747475870715697e-01,5.603887939105818283e-01,5.551369055823253129e-01,6.015880628918834905e-01,4.105977431029302904e-01,4.577037949762857716e-01,4.740604971335106166e-01,6.077914618632139776e-01,3.724395051525436484e-01,4.584005027727990900e-01,6.139944302629859463e-01,4.519206824054669003e-01,4.545545134226958162e-01,5.114658226806979568e-01,5.251616621830111 [...]
+5.089512413156394022e-01,4.686157344859071539e-01,4.671432657029127022e-01,5.507677110080698712e-01,5.537747475870715697e-01,0.000000000000000000e+00,2.491573026826307546e-02,1.611703256860722022e-01,4.050902007853898201e-01,5.271309824807074440e-01,5.421678466084556280e-01,5.728485530276049564e-01,3.893487319093435484e-01,5.228824546902469672e-01,5.026606664974099248e-01,4.646809225665177734e-01,4.812322457792746477e-01,5.416910441865054882e-01,5.965552184665069424e-01,6.220822813606143 [...]
+5.194198507399426790e-01,4.748457331330334097e-01,4.731440849186817066e-01,5.571948013872931593e-01,5.603887939105818283e-01,2.491573026826307546e-02,0.000000000000000000e+00,1.479709166253703467e-01,4.053369941033830859e-01,5.280562204885590472e-01,5.396837981033654197e-01,5.732633326808006524e-01,3.924753398755988498e-01,5.239940820637141172e-01,5.063921346492437081e-01,4.645705186694781830e-01,4.810056984041189421e-01,5.370408719725375812e-01,5.975319702985016912e-01,6.222342132950898 [...]
+5.193823887379401816e-01,5.236635025580955727e-01,5.242977698645472362e-01,5.495809397183640810e-01,5.551369055823253129e-01,1.611703256860722022e-01,1.479709166253703467e-01,0.000000000000000000e+00,3.492177108646368211e-01,5.191454871663023773e-01,5.360208730911749653e-01,5.708640709277446046e-01,3.157052984992420042e-01,5.157665304394571937e-01,4.996509454451424359e-01,4.020760395638746654e-01,5.006102991025388471e-01,5.298089652036237496e-01,5.981875537152626654e-01,6.234539483243525 [...]
+5.561588477743008019e-01,5.587277843297561164e-01,5.606719165234154145e-01,6.004474078601035325e-01,6.015880628918834905e-01,4.050902007853898201e-01,4.053369941033830859e-01,3.492177108646368211e-01,0.000000000000000000e+00,5.787763245912556087e-01,5.947402364918725759e-01,6.045483674623932657e-01,9.165803310131236115e-02,5.688042158876855936e-01,5.603521840083333094e-01,1.029099570840720096e-01,5.383186953957957988e-01,5.938386087336426922e-01,6.277766489453671106e-01,6.488662431385038 [...]
+4.163512597753724065e-01,4.493556863578908600e-01,4.278523166862108762e-01,3.503243491678726396e-01,4.105977431029302904e-01,5.271309824807074440e-01,5.280562204885590472e-01,5.191454871663023773e-01,5.787763245912556087e-01,0.000000000000000000e+00,2.901890433259129987e-01,3.681839260303323424e-01,6.022244990734756387e-01,1.811124561164345614e-01,2.718966622659546029e-01,6.197659324427388094e-01,2.206660352605798181e-01,2.886992245763274068e-01,4.873186889584547599e-01,5.141330669042245 [...]
+4.306243018448085436e-01,4.624274030387743317e-01,4.370683810650250467e-01,4.529903043932035911e-01,4.577037949762857716e-01,5.421678466084556280e-01,5.396837981033654197e-01,5.360208730911749653e-01,5.947402364918725759e-01,2.901890433259129987e-01,0.000000000000000000e+00,2.224186665906223437e-01,6.005182607819812279e-01,2.738747257345078290e-01,1.644340288579437259e-01,6.033083100497889095e-01,1.619044518443580327e-01,1.039537990206369777e-02,4.998083232290239097e-01,5.131082529680676 [...]
+4.021764597077106540e-01,4.733261369965310883e-01,4.282644509393823129e-01,4.693561709058894360e-01,4.740604971335106166e-01,5.728485530276049564e-01,5.732633326808006524e-01,5.708640709277446046e-01,6.045483674623932657e-01,3.681839260303323424e-01,2.224186665906223437e-01,0.000000000000000000e+00,6.099221393281807346e-01,3.060211392349294623e-01,1.604874890789679820e-01,6.128193453951833458e-01,2.768733117754587636e-01,2.298326394117929661e-01,5.114664732751669218e-01,5.242850335826516 [...]
+5.784421980040588274e-01,5.908681961098961599e-01,5.917902155132885866e-01,6.107611565993079639e-01,6.077914618632139776e-01,3.893487319093435484e-01,3.924753398755988498e-01,3.157052984992420042e-01,9.165803310131236115e-02,6.022244990734756387e-01,6.005182607819812279e-01,6.099221393281807346e-01,0.000000000000000000e+00,5.784155650022274608e-01,5.844042108607071517e-01,1.055100494373465425e-01,5.827374475409642374e-01,5.995011348821055464e-01,6.420741937457417059e-01,6.542406746383476 [...]
+3.894690240514162016e-01,4.260558983038921221e-01,4.054299366205006416e-01,3.453991354905851519e-01,3.724395051525436484e-01,5.228824546902469672e-01,5.239940820637141172e-01,5.157665304394571937e-01,5.688042158876855936e-01,1.811124561164345614e-01,2.738747257345078290e-01,3.060211392349294623e-01,5.784155650022274608e-01,0.000000000000000000e+00,2.702667347023501865e-01,5.882224117111736206e-01,2.060993838196036876e-01,2.711665252884237209e-01,4.746113290797831508e-01,4.899294434122138 [...]
+3.596343047726121567e-01,4.346942546599214530e-01,3.946725236888753985e-01,4.432350656829630031e-01,4.584005027727990900e-01,5.026606664974099248e-01,5.063921346492437081e-01,4.996509454451424359e-01,5.603521840083333094e-01,2.718966622659546029e-01,1.644340288579437259e-01,1.604874890789679820e-01,5.844042108607071517e-01,2.702667347023501865e-01,0.000000000000000000e+00,6.038245177290455512e-01,1.627595460444988262e-01,1.644153590675275223e-01,4.839358286711783474e-01,5.106177101745980 [...]
+5.957241700716232602e-01,6.084765378064718178e-01,6.092712326388293187e-01,6.249230459925192438e-01,6.139944302629859463e-01,4.646809225665177734e-01,4.645705186694781830e-01,4.020760395638746654e-01,1.029099570840720096e-01,6.197659324427388094e-01,6.033083100497889095e-01,6.128193453951833458e-01,1.055100494373465425e-01,5.882224117111736206e-01,6.038245177290455512e-01,0.000000000000000000e+00,6.022530944923193408e-01,6.023691076950261625e-01,6.561916932802045466e-01,6.638803843700791 [...]
+3.948106373579565731e-01,4.068332975756510650e-01,3.836311211344253791e-01,4.317497180552467007e-01,4.519206824054669003e-01,4.812322457792746477e-01,4.810056984041189421e-01,5.006102991025388471e-01,5.383186953957957988e-01,2.206660352605798181e-01,1.619044518443580327e-01,2.768733117754587636e-01,5.827374475409642374e-01,2.060993838196036876e-01,1.627595460444988262e-01,6.022530944923193408e-01,0.000000000000000000e+00,1.623620448297704022e-01,4.851833186120469432e-01,5.130926275274411 [...]
+4.299216154496899867e-01,4.620069849083962010e-01,4.373693824095087335e-01,4.498975543118769460e-01,4.545545134226958162e-01,5.416910441865054882e-01,5.370408719725375812e-01,5.298089652036237496e-01,5.938386087336426922e-01,2.886992245763274068e-01,1.039537990206369777e-02,2.298326394117929661e-01,5.995011348821055464e-01,2.711665252884237209e-01,1.644153590675275223e-01,6.023691076950261625e-01,1.623620448297704022e-01,0.000000000000000000e+00,4.985748244285860276e-01,5.111586941186136 [...]
+4.756344809638139437e-01,4.994679882027984741e-01,5.009002366112027538e-01,4.708637784029361395e-01,5.114658226806979568e-01,5.965552184665069424e-01,5.975319702985016912e-01,5.981875537152626654e-01,6.277766489453671106e-01,4.873186889584547599e-01,4.998083232290239097e-01,5.114664732751669218e-01,6.420741937457417059e-01,4.746113290797831508e-01,4.839358286711783474e-01,6.561916932802045466e-01,4.851833186120469432e-01,4.985748244285860276e-01,0.000000000000000000e+00,5.947296668588187 [...]
+5.059288076680952262e-01,5.304523728237284796e-01,5.318469579383493517e-01,4.937192405136009987e-01,5.251616621830111908e-01,6.220822813606143864e-01,6.222342132950898774e-01,6.234539483243525249e-01,6.488662431385038287e-01,5.141330669042245560e-01,5.131082529680676130e-01,5.242850335826516606e-01,6.542406746383476746e-01,4.899294434122138386e-01,5.106177101745980051e-01,6.638803843700791241e-01,5.130926275274411141e-01,5.111586941186136990e-01,5.947296668588187424e-02,0.000000000000000 [...]
+4.583925070947494351e-01,4.309918950097769708e-01,4.320315402844028818e-01,4.771997652131045875e-01,5.045215088234152878e-01,5.062399003252801011e-01,5.083181423550463540e-01,5.880028377735962986e-01,6.081438589727312660e-01,4.924309245937896762e-01,5.068606316050663363e-01,5.178256541273724700e-01,6.350540670986624114e-01,4.800201816138601751e-01,4.901476212430276314e-01,6.504929419133462787e-01,4.655700007033418686e-01,5.046943027349041211e-01,3.462297855910126398e-01,3.836893584804654 [...]
+4.012101569573018334e-01,4.716832712631371383e-01,4.268945695283685171e-01,4.656661667603181898e-01,4.719760250893897946e-01,5.730701343649712287e-01,5.735357291258064150e-01,5.708062597429177654e-01,6.042272822876816329e-01,3.686253449014643135e-01,2.238016973172938795e-01,5.323303717994434456e-03,6.098195173041641715e-01,3.055252888458825922e-01,1.602643062655509376e-01,6.124331432829944744e-01,2.773141537953040525e-01,2.308649271160685412e-01,5.099111683446555521e-01,5.222180557587547 [...]
+5.472136480575724438e-01,5.725502356299151385e-01,5.487401483224852994e-01,4.355100908682271177e-01,4.856982304044017429e-01,6.735433470142013768e-01,6.787815062921317422e-01,6.744368573689681901e-01,7.217869043655141814e-01,5.110003547197520257e-01,5.794207921538752082e-01,5.903062770946920779e-01,7.281429348808831081e-01,4.878422749415917692e-01,5.808811306668881791e-01,7.318658779312794405e-01,5.732920959066124533e-01,5.761681084352927584e-01,6.526094673448205041e-01,6.634256805953907 [...]
+3.899714195345493906e-01,4.375267064567859854e-01,4.174073061844430499e-01,4.053251750401506981e-01,4.454623839159053689e-01,5.641903898424096564e-01,5.695112426461662603e-01,5.659567977103004965e-01,6.219262885432305099e-01,3.871961461137835014e-01,4.674394661417954899e-01,4.779479449505297795e-01,6.325949678647790897e-01,3.836536769353438014e-01,4.393347785120860771e-01,6.464283074126283557e-01,4.531346084911335725e-01,4.640566046186616056e-01,4.889089132353163225e-01,5.141886373088420 [...]
+5.360867940489236760e-01,5.304850195279224856e-01,5.275350376316415568e-01,4.883285272756597850e-01,5.128633493917508224e-01,6.311925150077088986e-01,6.352972682550631633e-01,6.307206608173033802e-01,6.741837537585156914e-01,4.473358399763122106e-01,5.332765179318331716e-01,5.527212220441330981e-01,6.838021135517843296e-01,4.647728371761802313e-01,5.410934831829915526e-01,6.880538537478639860e-01,5.350265807796883433e-01,5.292847271569754986e-01,5.708202987999934086e-01,5.854900219495362 [...]
+6.487705757391363237e-01,6.672748433651525257e-01,6.692029259807907859e-01,6.542296632320393490e-01,6.592110164802822414e-01,6.574230323838143875e-01,6.690373728427478905e-01,6.622771832007654513e-01,6.429060165962575679e-01,6.608662081392998600e-01,7.031555446589029534e-01,7.178834098968616129e-01,6.499101289948663318e-01,6.459119711937502117e-01,6.895772809086928579e-01,6.568253032974652461e-01,6.940727920501230663e-01,6.981519823113071510e-01,7.285522166179710934e-01,7.391634870553134 [...]
+5.998625171971277981e-01,6.123788794455703455e-01,6.132930971190980740e-01,6.285474272976585564e-01,6.179095467172315281e-01,4.481864791344000731e-01,4.480340246396182224e-01,3.850968752884939006e-01,1.017098877913095306e-01,6.261669646893208707e-01,6.051256961481351615e-01,6.143123567512647210e-01,8.359188239877084203e-02,5.974030109637227248e-01,6.098586618910816215e-01,3.651464225757471843e-02,6.085419081925419471e-01,6.041252890336855641e-01,6.593472880063879771e-01,6.664441836948572 [...]
+5.997454726533050229e-01,6.122348794571184083e-01,6.131593247578512873e-01,6.284117430777713675e-01,6.178961996529648282e-01,4.596215520617095152e-01,4.594825994476301467e-01,3.970939470278556649e-01,1.067253327004019092e-01,6.261799106522016878e-01,6.051900883305931789e-01,6.142021970651400098e-01,9.876684181734898815e-02,5.975297440441617036e-01,6.097634072542180705e-01,3.782615487455098791e-02,6.086067010639361063e-01,6.041885105749894569e-01,6.590990957619877744e-01,6.666271920465147 [...]
diff --git a/inst/extdata/gp500-wufu.csv b/inst/extdata/gp500-wufu.csv
new file mode 100644
index 0000000..80638c4
--- /dev/null
+++ b/inst/extdata/gp500-wufu.csv
@@ -0,0 +1,28 @@
+0.000000000000000000e+00,2.571980495645824760e-01,2.520346115460042058e-01,4.744382427616021247e-01,4.989614981132786120e-01,6.689787969026476100e-01,6.812281507216892962e-01,6.555567542568571104e-01,7.310991860267560538e-01,5.078905334089052204e-01,4.968848730883969611e-01,4.757622409158179799e-01,7.580047110432343027e-01,4.513879975788970267e-01,4.329735351137122268e-01,7.791212809657358518e-01,4.640097693837200166e-01,4.973067140562989485e-01,6.461782576479819795e-01,7.027059209274459 [...]
+2.571980495645824760e-01,0.000000000000000000e+00,7.134612988694838398e-02,4.777025955265382096e-01,4.951411376847378309e-01,6.120029911064691630e-01,6.187580426361958263e-01,6.565376359727377320e-01,7.297573669951301634e-01,5.443562366893041071e-01,5.296760494668689701e-01,5.559325507001195987e-01,7.692977771088824746e-01,4.901932473021416992e-01,5.196689793315965034e-01,7.906605608280053987e-01,4.747036915887559161e-01,5.305191359028126330e-01,6.743391911823016782e-01,7.322877247763698 [...]
+2.520346115460042058e-01,7.134612988694838398e-02,0.000000000000000000e+00,4.693753049158582713e-01,4.933288074623221608e-01,6.107504985467897551e-01,6.172198092670282410e-01,6.580854026600714279e-01,7.331013795263987554e-01,5.189208571519215996e-01,5.012565383133873453e-01,5.036212703902255283e-01,7.713476639324429174e-01,4.670442483906813713e-01,4.723902373484317185e-01,7.925677242739211925e-01,4.481814397512652870e-01,5.028557491817996494e-01,6.769918677505067928e-01,7.349763414173465 [...]
+4.744382427616021247e-01,4.777025955265382096e-01,4.693753049158582713e-01,0.000000000000000000e+00,2.645268192027018794e-01,7.400148992213936294e-01,7.470296402057268192e-01,7.097097307186355231e-01,8.068398585687770463e-01,4.375693916481014356e-01,5.359106447067363277e-01,5.689295976109136621e-01,8.181783158971304148e-01,4.103905163090193486e-01,5.465564006349791226e-01,8.355444999470627021e-01,5.200217146612724495e-01,5.335416879774483867e-01,6.534368094724417597e-01,7.001543010318193 [...]
+4.989614981132786120e-01,4.951411376847378309e-01,4.933288074623221608e-01,2.645268192027018794e-01,0.000000000000000000e+00,7.349747565362398927e-01,7.421229496131191983e-01,7.077817606609246859e-01,7.985081669022223716e-01,5.061206194338655484e-01,5.339818340585343881e-01,5.668586253668712605e-01,8.042339554822514058e-01,4.364118808149922701e-01,5.577404773522754988e-01,8.108646928555500732e-01,5.369063854561715310e-01,5.316109896625814679e-01,7.013952539990288004e-01,7.361322646576331 [...]
+6.689787969026476100e-01,6.120029911064691630e-01,6.107504985467897551e-01,7.400148992213936294e-01,7.349747565362398927e-01,0.000000000000000000e+00,3.557943486575973480e-02,2.221988673322194685e-01,5.796940108802640657e-01,7.044231482903254848e-01,6.887399116765321461e-01,7.443836894691397976e-01,5.555608450075403271e-01,6.669138478690739280e-01,6.637135108780192194e-01,6.618586260527288267e-01,6.216294403139074332e-01,6.896873186372546316e-01,8.799392747988969976e-01,9.364924893381815 [...]
+6.812281507216892962e-01,6.187580426361958263e-01,6.172198092670282410e-01,7.470296402057268192e-01,7.421229496131191983e-01,3.557943486575973480e-02,0.000000000000000000e+00,2.035709779514239814e-01,5.788681563869136903e-01,7.041235947958145447e-01,6.840145121386437266e-01,7.432551951207598817e-01,5.588805844877926532e-01,6.668075153312627013e-01,6.671675787736672802e-01,6.603500575772333070e-01,6.199376761145461989e-01,6.822045507296837785e-01,8.796419491559629433e-01,9.349112896782281 [...]
+6.555567542568571104e-01,6.565376359727377320e-01,6.580854026600714279e-01,7.097097307186355231e-01,7.077817606609246859e-01,2.221988673322194685e-01,2.035709779514239814e-01,0.000000000000000000e+00,4.814956523471237704e-01,6.666312188230212321e-01,6.529289131447337802e-01,7.119825036446741917e-01,4.339864566202507601e-01,6.308936763942432080e-01,6.336372466982959084e-01,5.516838373092527448e-01,6.205086037225401308e-01,6.468812088195157184e-01,8.510971552929724737e-01,9.059876090631582 [...]
+7.310991860267560538e-01,7.297573669951301634e-01,7.331013795263987554e-01,8.068398585687770463e-01,7.985081669022223716e-01,5.796940108802640657e-01,5.788681563869136903e-01,4.814956523471237704e-01,0.000000000000000000e+00,7.735108885381684063e-01,7.555992538282152449e-01,7.856512959612996783e-01,1.307980953536604374e-01,7.255561172979457796e-01,7.399594354798568752e-01,1.465905173026367525e-01,6.954378374184141398e-01,7.561563886120427069e-01,9.260703744956471084e-01,9.768945488504321 [...]
+5.078905334089052204e-01,5.443562366893041071e-01,5.189208571519215996e-01,4.375693916481014356e-01,5.061206194338655484e-01,7.044231482903254848e-01,7.041235947958145447e-01,6.666312188230212321e-01,7.735108885381684063e-01,0.000000000000000000e+00,3.411981909561120352e-01,4.436163320464514559e-01,8.023631956533864296e-01,2.138740921320866351e-01,3.333005299748383332e-01,8.241423165830282782e-01,2.641769146671203439e-01,3.402742377199050594e-01,6.727281764105856876e-01,7.253648293478399 [...]
+4.968848730883969611e-01,5.296760494668689701e-01,5.012565383133873453e-01,5.359106447067363277e-01,5.339818340585343881e-01,6.887399116765321461e-01,6.840145121386437266e-01,6.529289131447337802e-01,7.555992538282152449e-01,3.411981909561120352e-01,0.000000000000000000e+00,2.533097796914561273e-01,7.604618787404671121e-01,3.053433127474357578e-01,1.907180813065976754e-01,7.624454106771454409e-01,1.831447036541874462e-01,1.156648643687512197e-02,6.569875066739947478e-01,6.900590930022961 [...]
+4.757622409158179799e-01,5.559325507001195987e-01,5.036212703902255283e-01,5.689295976109136621e-01,5.668586253668712605e-01,7.443836894691397976e-01,7.432551951207598817e-01,7.119825036446741917e-01,7.856512959612996783e-01,4.436163320464514559e-01,2.533097796914561273e-01,0.000000000000000000e+00,7.901178889548705619e-01,3.500879786460276422e-01,1.908105599300548849e-01,7.922969912920321267e-01,3.212527761562514184e-01,2.624124236686864453e-01,6.871945199736509968e-01,7.203459197988155 [...]
+7.580047110432343027e-01,7.692977771088824746e-01,7.713476639324429174e-01,8.181783158971304148e-01,8.042339554822514058e-01,5.555608450075403271e-01,5.588805844877926532e-01,4.339864566202507601e-01,1.307980953536604374e-01,8.023631956533864296e-01,7.604618787404671121e-01,7.901178889548705619e-01,0.000000000000000000e+00,7.354292100546407918e-01,7.693090744183115337e-01,1.498588179870597448e-01,7.504162947482816426e-01,7.608927093991846302e-01,9.445118582328251300e-01,9.822860983249912 [...]
+4.513879975788970267e-01,4.901932473021416992e-01,4.670442483906813713e-01,4.103905163090193486e-01,4.364118808149922701e-01,6.669138478690739280e-01,6.668075153312627013e-01,6.308936763942432080e-01,7.255561172979457796e-01,2.138740921320866351e-01,3.053433127474357578e-01,3.500879786460276422e-01,7.354292100546407918e-01,0.000000000000000000e+00,3.148493074258849611e-01,7.463872941986602338e-01,2.341911709763438987e-01,3.031014118012900216e-01,6.262928488423964435e-01,6.613913919939808 [...]
+4.329735351137122268e-01,5.196689793315965034e-01,4.723902373484317185e-01,5.465564006349791226e-01,5.577404773522754988e-01,6.637135108780192194e-01,6.671675787736672802e-01,6.336372466982959084e-01,7.399594354798568752e-01,3.333005299748383332e-01,1.907180813065976754e-01,1.908105599300548849e-01,7.693090744183115337e-01,3.148493074258849611e-01,0.000000000000000000e+00,7.933229931670241486e-01,1.922591160428399892e-01,1.911678309695745426e-01,6.603474772368490564e-01,7.122692860366849 [...]
+7.791212809657358518e-01,7.906605608280053987e-01,7.925677242739211925e-01,8.355444999470627021e-01,8.108646928555500732e-01,6.618586260527288267e-01,6.603500575772333070e-01,5.516838373092527448e-01,1.465905173026367525e-01,8.241423165830282782e-01,7.624454106771454409e-01,7.922969912920321267e-01,1.498588179870597448e-01,7.463872941986602338e-01,7.933229931670241486e-01,0.000000000000000000e+00,7.740005240212891602e-01,7.629855545424351071e-01,9.635936956808555109e-01,9.950540846248455 [...]
+4.640097693837200166e-01,4.747036915887559161e-01,4.481814397512652870e-01,5.200217146612724495e-01,5.369063854561715310e-01,6.216294403139074332e-01,6.199376761145461989e-01,6.205086037225401308e-01,6.954378374184141398e-01,2.641769146671203439e-01,1.831447036541874462e-01,3.212527761562514184e-01,7.504162947482816426e-01,2.341911709763438987e-01,1.922591160428399892e-01,7.740005240212891602e-01,0.000000000000000000e+00,1.841278448564223347e-01,6.481465328626381250e-01,7.010186455961859 [...]
+4.973067140562989485e-01,5.305191359028126330e-01,5.028557491817996494e-01,5.335416879774483867e-01,5.316109896625814679e-01,6.896873186372546316e-01,6.822045507296837785e-01,6.468812088195157184e-01,7.561563886120427069e-01,3.402742377199050594e-01,1.156648643687512197e-02,2.624124236686864453e-01,7.608927093991846302e-01,3.031014118012900216e-01,1.911678309695745426e-01,7.629855545424351071e-01,1.841278448564223347e-01,0.000000000000000000e+00,6.567955880345697217e-01,6.889027774383713 [...]
+6.461782576479819795e-01,6.743391911823016782e-01,6.769918677505067928e-01,6.534368094724417597e-01,7.013952539990288004e-01,8.799392747988969976e-01,8.796419491559629433e-01,8.510971552929724737e-01,9.260703744956471084e-01,6.727281764105856876e-01,6.569875066739947478e-01,6.871945199736509968e-01,9.445118582328251300e-01,6.262928488423964435e-01,6.603474772368490564e-01,9.635936956808555109e-01,6.481465328626381250e-01,6.567955880345697217e-01,0.000000000000000000e+00,9.215635158215965 [...]
+7.027059209274459928e-01,7.322877247763698305e-01,7.349763414173465792e-01,7.001543010318193039e-01,7.361322646576331508e-01,9.364924893381815219e-01,9.349112896782281190e-01,9.059876090631582279e-01,9.768945488504321428e-01,7.253648293478399678e-01,6.900590930022961755e-01,7.203459197988155704e-01,9.822860983249912525e-01,6.613913919939808927e-01,7.122692860366849343e-01,9.950540846248455695e-01,7.010186455961859853e-01,6.889027774383713609e-01,9.215635158215965128e-02,0.000000000000000 [...]
+6.119193166076762891e-01,5.717015674190115693e-01,5.737007593459153565e-01,6.509503081496245125e-01,6.799472242856974269e-01,7.347554956481108235e-01,7.362932839923789219e-01,8.227082354328313363e-01,8.827346538920231023e-01,6.681462423233449632e-01,6.542773382364410750e-01,6.834991171820978817e-01,9.191747355282550158e-01,6.220844502241963392e-01,6.572384404654497692e-01,9.398500792312232122e-01,6.109411762256651279e-01,6.529279834698092921e-01,5.177978328019312571e-01,5.854769755275890 [...]
+4.748730870752717048e-01,5.543015260216846984e-01,5.022805561486918258e-01,5.647515135929351837e-01,5.646648590825015246e-01,7.450343553685653264e-01,7.439713946986189486e-01,7.122717022136850762e-01,7.856164792873150393e-01,4.443815146632308366e-01,2.550265545184279281e-01,6.220909281069654691e-03,7.903709430364236344e-01,3.497141131498656685e-01,1.906466499736508557e-01,7.921853305343439233e-01,3.219398101912840926e-01,2.637371722526354878e-01,6.854276045145138996e-01,7.178365239832332 [...]
+6.247000722398078354e-01,6.487888088392389507e-01,6.225959069860617090e-01,5.098873100437379957e-01,5.606825289889606490e-01,8.473682265993656149e-01,8.519838232101711872e-01,8.132591262976328617e-01,9.081524231641808775e-01,5.945539463134961178e-01,6.359259751868615584e-01,6.650497105471193660e-01,9.131447311857994187e-01,5.379106081391529237e-01,6.666054779423773446e-01,9.159337425547328104e-01,6.414684881270744654e-01,6.340080506072395261e-01,8.498344094613506927e-01,8.840753981184115 [...]
+4.762084528056608201e-01,5.305847680844600900e-01,5.067852827275067806e-01,5.067849076566037780e-01,5.496647530489815470e-01,7.546668742880079384e-01,7.601275390428960943e-01,7.274635344450983698e-01,8.319727018407380159e-01,4.813021864629022373e-01,5.502020283522451871e-01,5.764783880439104946e-01,8.436340199982671484e-01,4.535428095595068210e-01,5.391127284050489799e-01,8.604218968653931654e-01,5.430618421700832421e-01,5.475507343649762548e-01,6.755473364778000755e-01,7.260993960703802 [...]
+6.622012412208675425e-01,6.508013601173471274e-01,6.479395166877526258e-01,6.174573025714377161e-01,6.400702920805142471e-01,8.531977845823562889e-01,8.568983587137867231e-01,8.196105390174047711e-01,9.113942776181924721e-01,5.623718875125248706e-01,6.352221337304596593e-01,6.744670996944061026e-01,9.215749744947382416e-01,5.559986788745134856e-01,6.716185856884346883e-01,9.255378377714674754e-01,6.487567229496442600e-01,6.319847782000498349e-01,7.967840951632592095e-01,8.350491372730355 [...]
+7.901446633526449936e-01,8.070456426645433723e-01,8.103381443575647936e-01,8.158838152479405137e-01,8.112867285329776745e-01,8.772549397631942414e-01,8.908068739118607526e-01,8.491342711737631666e-01,8.579639784750976439e-01,8.193543705615734529e-01,8.253844389068152321e-01,8.635613641021931963e-01,8.646290257641253696e-01,7.614922323730163312e-01,8.439635552301293364e-01,8.721417648675438006e-01,8.295765173461729924e-01,8.215128193039382687e-01,1.004322875733721032e+00,1.041407765600347 [...]
+7.851793020014403712e-01,7.963904348776867925e-01,7.984596643850490061e-01,8.410669624513120723e-01,8.167002312494561211e-01,6.388475119204151120e-01,6.373269858923783504e-01,5.288014220326737735e-01,1.449905486839718760e-01,8.333281503590954653e-01,7.653935010987986676e-01,7.948884769043059739e-01,1.188178178305340643e-01,7.586794424686604810e-01,8.019072543302258049e-01,5.180826100765855563e-02,7.827377511454296499e-01,7.658602544277339774e-01,9.689372582369972786e-01,9.996141484448112 [...]
+7.866513046759215921e-01,7.978622150010939995e-01,7.999470584725264510e-01,8.425882883923833111e-01,8.183569815875804121e-01,6.563926233267275423e-01,6.548576654375714012e-01,5.463514326451299752e-01,1.524294255559964784e-01,8.350422181355225959e-01,7.671149075760402836e-01,7.964103172033403233e-01,1.406552035300040471e-01,7.604595905277961032e-01,8.034343562647324744e-01,5.377158362703571282e-02,7.844703094771621199e-01,7.675776473830252522e-01,9.703585738307963782e-01,1.001695090532452 [...]
diff --git a/inst/extdata/gp500test.env.txt b/inst/extdata/gp500test.env.txt
new file mode 100644
index 0000000..33c640e
--- /dev/null
+++ b/inst/extdata/gp500test.env.txt
@@ -0,0 +1,3093 @@
+153762 NP5 1
+175045 AQC4cm 1
+175045 AQC7cm 2
+175045 Even1 27
+175045 Even2 1
+175045 F21Plmr 2
+175045 F21Tong 1
+175045 LMEpi24M 3
+175045 M11Fcsw 1
+175045 NP2 1
+175045 NP3 6
+175045 NP5 3
+175045 TRRsed1 83
+175045 TRRsed2 1692
+175045 TRRsed3 3030
+175045 TS28 5
+71074 AQC4cm 1
+71074 CC1 341
+71074 CL3 93
+71074 Even1 4
+71074 Even2 3
+71074 Even3 30
+71074 F21Plmr 58
+71074 LMEpi24M 1
+71074 M11Plmr 48
+71074 M31Fcsw 1
+71074 M31Plmr 23
+71074 M31Tong 2
+71074 NP3 1
+71074 SLEpi20M 1
+71074 SV1 11788
+71074 TRRsed1 9
+71074 TRRsed2 118
+71074 TRRsed3 51
+71074 TS29 1
+525569 AQC1cm 1
+525569 AQC4cm 14
+525569 AQC7cm 18
+525569 LMEpi24M 1
+525569 NP2 1
+557121 AQC4cm 1
+560734 AQC1cm 3
+560734 AQC4cm 3
+560734 AQC7cm 2
+560734 CC1 1
+560734 LMEpi24M 20
+560734 M11Plmr 1
+560734 SV1 3
+341901 AQC7cm 1
+341901 CC1 33
+341901 CL3 4
+341901 F21Fcsw 1
+341901 M11Plmr 12
+341901 M31Plmr 1
+341901 SV1 217
+341901 TRRsed2 1
+286030 M11Plmr 7
+275402 NP3 1
+275402 NP5 1
+275402 TRRsed1 2
+275402 TRRsed2 19
+275402 TRRsed3 13
+321069 TRRsed2 5
+321069 TRRsed3 1
+161544 AQC1cm 9
+161544 AQC4cm 8
+161544 AQC7cm 12
+161544 CC1 31
+161544 CL3 7
+161544 Even2 1
+161544 SV1 6
+215003 AQC1cm 16
+215003 AQC4cm 29
+215003 AQC7cm 19
+215003 CC1 182
+215003 CL3 9
+215003 Even1 1
+215003 Even2 3
+215003 LMEpi24M 1
+215003 M11Plmr 3
+215003 M31Plmr 5
+215003 NP5 1
+215003 SV1 105
+215003 TRRsed3 1
+548077 AQC1cm 1
+548077 AQC4cm 3
+548077 AQC7cm 3
+548077 CC1 32
+548077 CL3 24
+548077 SV1 169
+257176 CC1 2
+257176 CL3 1
+100157 CC1 32
+100157 CL3 3
+100157 SV1 4
+100157 TRRsed2 1
+576085 CC1 23
+576085 CL3 30
+576085 F21Plmr 1
+576085 M11Plmr 6
+576085 M31Plmr 2
+576085 SV1 2
+576085 TRRsed1 1
+576085 TRRsed2 9
+576085 TRRsed3 5
+100757 F21Plmr 2
+100757 F21Tong 6
+100757 M11Tong 1
+100757 M31Tong 1
+100757 SV1 3
+91919 M31Plmr 7
+91919 TRRsed2 1
+91919 TRRsed3 1
+89337 AQC1cm 1
+89337 CC1 26
+89337 CL3 288
+89337 Even1 1
+89337 Even3 1
+89337 LMEpi24M 1
+89337 M11Plmr 3
+89337 SV1 200
+89337 TRRsed1 1
+542118 CC1 1
+542118 CL3 4
+542118 Even3 1
+542118 SV1 17
+265094 AQC1cm 1
+265094 AQC7cm 2
+265094 CC1 374
+265094 CL3 61
+265094 Even1 3
+265094 Even2 2
+265094 Even3 28
+265094 F21Plmr 1
+265094 F21Tong 1
+265094 M11Plmr 15
+265094 M11Tong 1
+265094 M31Plmr 14
+265094 M31Tong 1
+265094 NP5 2
+265094 SV1 4361
+265094 TRRsed1 3
+265094 TRRsed2 2
+265094 TRRsed3 1
+265094 TS28 1
+265094 TS29 2
+128051 CC1 16
+128051 CL3 489
+128051 Even1 5
+128051 M31Fcsw 2
+23870 CC1 1
+23870 CL3 3
+23870 Even3 1
+23870 SV1 7
+23870 TRRsed2 1
+23870 TRRsed3 1
+160135 AQC4cm 5
+160135 AQC7cm 24
+160135 CC1 24
+160135 CL3 40
+160135 LMEpi24M 1
+160135 SLEpi20M 1
+160135 SV1 22
+160135 TRRsed2 3
+160135 TRRsed3 2
+574226 CL3 2
+574226 Even2 1
+574226 TRRsed2 1
+329744 AQC1cm 3542
+329744 AQC4cm 695
+329744 AQC7cm 2908
+329744 CC1 23
+329744 CL3 21
+329744 Even1 129
+329744 Even2 66
+329744 Even3 99
+329744 F21Fcsw 27
+329744 F21Plmr 29
+329744 F21Tong 1360
+329744 LMEpi24M 102525
+329744 M11Fcsw 45
+329744 M11Plmr 64
+329744 M11Tong 1474
+329744 M31Fcsw 28
+329744 M31Plmr 15
+329744 M31Tong 296
+329744 NP2 91
+329744 NP3 126
+329744 NP5 120
+329744 SLEpi20M 323914
+329744 SV1 67
+329744 TRRsed1 83
+329744 TRRsed2 51
+329744 TRRsed3 28
+329744 TS28 52
+329744 TS29 48
+160337 AQC1cm 1
+160337 AQC7cm 1
+160337 CC1 25
+160337 CL3 62
+160337 Even3 1
+160337 F21Plmr 1
+160337 LMEpi24M 1
+160337 M11Plmr 2
+160337 SV1 88
+160337 TS29 1
+104310 AQC1cm 1
+104310 AQC4cm 3
+104310 CC1 43
+104310 CL3 46
+104310 F21Plmr 1
+104310 LMEpi24M 1
+104310 M11Plmr 4
+104310 M31Plmr 1
+104310 SV1 28
+253651 CC1 4
+253651 CL3 2
+253651 SV1 20
+76142 AQC1cm 1
+76142 AQC4cm 3
+76142 AQC7cm 2
+76142 CC1 38
+76142 CL3 5
+76142 M11Plmr 7
+76142 M31Plmr 5
+76142 SV1 7
+144921 CC1 1
+144921 LMEpi24M 19
+144921 M11Plmr 2
+144921 SV1 5
+349277 AQC1cm 2
+349277 AQC4cm 4
+349277 AQC7cm 1
+349277 CC1 4
+349277 F21Plmr 2
+349277 LMEpi24M 2
+349277 M11Plmr 5
+349277 M31Plmr 26
+349277 SLEpi20M 1
+152720 CC1 1
+152720 F21Tong 1
+152720 M11Plmr 2
+218035 AQC1cm 4
+218035 AQC4cm 3
+218035 AQC7cm 2
+218035 CC1 452
+218035 CL3 69
+218035 Even2 1
+218035 F21Plmr 39
+218035 F21Tong 1
+218035 LMEpi24M 5
+218035 M11Plmr 242
+218035 M11Tong 1
+218035 M31Plmr 75
+218035 M31Tong 2
+218035 SLEpi20M 8
+218035 SV1 29
+218035 TRRsed2 1
+218035 TRRsed3 1
+29284 AQC1cm 2
+29284 AQC4cm 2
+29284 AQC7cm 3
+29284 CC1 135
+29284 CL3 1
+29284 Even2 3
+29284 F21Plmr 2
+29284 F21Tong 3
+29284 M11Plmr 93
+29284 M31Plmr 4
+29284 M31Tong 1
+29284 SLEpi20M 5
+29284 SV1 2
+29284 TRRsed1 7
+29284 TRRsed2 14
+29284 TRRsed3 2
+588981 F21Tong 1
+588981 M11Plmr 19
+588981 M31Plmr 1
+12364 AQC7cm 1
+12364 CC1 7
+12364 M11Plmr 4
+12364 M31Tong 1
+139920 CC1 1
+139920 M11Plmr 7
+26061 F21Plmr 169
+26061 F21Tong 13
+26061 LMEpi24M 1
+26061 M11Plmr 14
+26061 M31Plmr 190
+26061 M31Tong 13
+26061 TS29 1
+471141 AQC1cm 58
+471141 AQC4cm 23
+471141 AQC7cm 52
+471141 CC1 20
+471141 CL3 8
+471141 Even1 25
+471141 Even2 29
+471141 Even3 11
+471141 F21Fcsw 12
+471141 F21Plmr 782
+471141 F21Tong 95813
+471141 LMEpi24M 491
+471141 M11Fcsw 12
+471141 M11Plmr 306
+471141 M11Tong 1150
+471141 M31Fcsw 15
+471141 M31Plmr 9196
+471141 M31Tong 91809
+471141 NP2 9
+471141 NP3 19
+471141 NP5 138
+471141 SLEpi20M 21
+471141 SV1 13
+471141 TRRsed1 12
+471141 TRRsed2 18
+471141 TRRsed3 10
+471141 TS28 17
+471141 TS29 16
+261709 CC1 1
+320879 AQC1cm 2
+320879 AQC4cm 4
+320879 CC1 1
+320879 CL3 1
+320879 Even1 2
+320879 Even2 1
+320879 Even3 3
+320879 F21Plmr 339
+320879 F21Tong 14
+320879 LMEpi24M 2
+320879 M11Fcsw 5
+320879 M11Plmr 5949
+320879 M11Tong 3
+320879 M31Fcsw 161
+320879 M31Plmr 6385
+320879 M31Tong 8
+320879 NP3 4
+320879 NP5 2
+320879 SLEpi20M 1
+320879 TRRsed2 2
+320879 TRRsed3 3
+320879 TS28 3
+320879 TS29 1
+503327 F21Plmr 3
+503327 M11Plmr 41
+503327 M31Plmr 13
+261774 CC1 2
+261774 Even3 1
+261774 F21Fcsw 3
+261774 F21Plmr 238
+261774 F21Tong 1
+261774 M11Plmr 852
+261774 M11Tong 2
+261774 M31Fcsw 6
+261774 M31Plmr 1030
+261774 M31Tong 1
+261774 SLEpi20M 4
+261774 SV1 1
+261774 TRRsed2 1
+261774 TS29 1
+13333 CC1 13
+13333 CL3 10
+13333 M11Plmr 14
+13333 SLEpi20M 1
+13333 TRRsed2 2
+13069 CC1 127
+13069 CL3 10
+13069 M11Plmr 5
+13069 M31Tong 1
+13069 TRRsed3 1
+203691 Even1 14
+203691 Even2 4
+203691 Even3 4
+203691 TS28 1
+292521 AQC1cm 2
+292521 Even1 74
+292521 Even2 63
+292521 Even3 56
+292521 M31Fcsw 3
+292521 TS28 25
+292521 TS29 141
+201720 F21Plmr 3
+201720 M11Plmr 2
+201720 M31Plmr 6
+72610 F21Plmr 21
+72610 F21Tong 4
+72610 M31Plmr 2
+90465 AQC1cm 3
+90465 CC1 1
+90465 SLEpi20M 1
+10517 F21Fcsw 5
+10517 M31Fcsw 2
+23777 F21Plmr 1
+23777 M31Plmr 7
+576850 AQC4cm 3
+576850 CC1 5
+576850 CL3 41
+576850 F21Fcsw 71
+576850 SV1 10
+531589 F21Fcsw 1
+272602 F21Fcsw 1
+568172 NP2 1
+259270 CL3 9
+259270 SV1 1
+42014 AQC1cm 13
+42014 Even1 1
+42014 F21Plmr 4
+42014 M11Plmr 26
+42014 M31Plmr 5
+42014 M31Tong 1
+42014 NP5 1
+42014 TRRsed2 31
+42014 TRRsed3 1
+264558 F21Plmr 8
+264558 M11Plmr 31
+330861 AQC1cm 5
+330861 AQC7cm 2
+330861 CC1 21
+330861 CL3 1
+330861 Even2 1
+330861 F21Fcsw 1
+330861 F21Plmr 297
+330861 F21Tong 6
+330861 LMEpi24M 3
+330861 M11Fcsw 4
+330861 M11Plmr 402
+330861 M11Tong 1
+330861 M31Fcsw 5
+330861 M31Plmr 179
+330861 M31Tong 8
+330861 NP2 2
+330861 NP3 7
+330861 NP5 9
+330861 SLEpi20M 1
+330861 SV1 2
+330861 TRRsed1 60
+330861 TRRsed2 4618
+330861 TRRsed3 55
+330861 TS29 3
+550329 AQC1cm 52
+550329 AQC4cm 4
+550329 F21Tong 1
+550329 LMEpi24M 1
+550329 SLEpi20M 60
+328963 AQC1cm 97
+328963 AQC4cm 212
+328963 AQC7cm 125
+328963 CC1 4
+328963 Even1 2
+328963 M31Tong 4
+328963 NP2 3
+328963 NP3 14
+328963 NP5 123
+328963 SLEpi20M 6
+328963 TS29 1
+159246 AQC1cm 6
+159246 AQC4cm 3
+159246 AQC7cm 2
+103589 SLEpi20M 4
+136145 AQC1cm 7
+136145 AQC4cm 5
+247816 AQC4cm 3
+247816 LMEpi24M 2
+279083 AQC1cm 72
+279083 AQC4cm 21
+279083 AQC7cm 19
+279083 F21Tong 1
+279083 LMEpi24M 1
+279083 TS28 1
+558480 AQC1cm 51
+558480 AQC4cm 38
+558480 AQC7cm 9
+558480 F21Tong 1
+558480 LMEpi24M 76
+558480 NP2 5
+558480 NP5 16
+558480 SLEpi20M 68
+373357 AQC1cm 1
+373357 AQC4cm 6
+373357 AQC7cm 2
+373357 CC1 4
+373357 CL3 1
+373357 Even3 1
+373357 F21Fcsw 589
+373357 LMEpi24M 7
+373357 M11Fcsw 35
+373357 M11Plmr 1
+373357 M11Tong 1
+373357 M31Plmr 1
+373357 M31Tong 4
+373357 NP3 1
+373357 TRRsed3 2
+306610 TRRsed1 12
+334326 NP3 2
+18643 TRRsed1 1
+18643 TRRsed2 1
+355680 AQC4cm 1
+355680 AQC7cm 2
+355680 CC1 2
+355680 SLEpi20M 2
+355680 TRRsed2 1
+547752 AQC1cm 3
+547752 AQC4cm 3
+547752 AQC7cm 2
+547752 CC1 37
+547752 CL3 163
+547752 Even1 1
+547752 LMEpi24M 13
+547752 M11Plmr 2
+547752 SV1 9
+166835 AQC1cm 10
+166835 AQC4cm 11
+166835 AQC7cm 4
+166835 CC1 3814
+166835 CL3 5129
+166835 Even1 30
+166835 Even2 9
+166835 Even3 3
+166835 F21Fcsw 1
+166835 F21Tong 2
+166835 LMEpi24M 7
+166835 M11Plmr 5
+166835 M11Tong 1
+166835 M31Fcsw 1
+166835 M31Plmr 2
+166835 M31Tong 4
+166835 NP3 1
+166835 NP5 1
+166835 SLEpi20M 1
+166835 SV1 3
+166835 TRRsed3 4
+166835 TS29 1
+295422 CC1 17
+295422 CL3 105
+295422 NP3 1
+295422 SV1 18
+234044 LMEpi24M 16
+207982 CC1 3
+207982 CL3 1
+207982 Even1 1
+207982 M11Plmr 1
+207982 SLEpi20M 1
+207982 SV1 1
+470724 AQC7cm 1
+470724 F21Plmr 491
+470724 F21Tong 34
+470724 M11Plmr 7
+470724 M31Plmr 23
+470724 M31Tong 9
+470724 NP2 1
+394191 SV1 2
+151439 AQC1cm 7
+151439 AQC4cm 7
+151439 AQC7cm 9
+151439 CC1 914
+151439 CL3 18
+151439 Even2 6
+151439 F21Tong 1
+151439 M11Plmr 21
+151439 SV1 5
+101544 AQC1cm 3
+101544 AQC7cm 2
+101544 LMEpi24M 13
+101544 SLEpi20M 13
+356589 AQC7cm 1
+356589 CL3 1
+356589 NP3 1
+107184 AQC1cm 4
+107184 AQC4cm 11
+107184 AQC7cm 11
+107184 CC1 2
+107184 CL3 3
+107184 F21Tong 1
+107184 LMEpi24M 5
+107184 M11Plmr 1
+107184 SLEpi20M 1
+163176 AQC1cm 26
+163176 AQC4cm 15
+163176 AQC7cm 53
+163176 CC1 73
+163176 CL3 10
+163176 Even1 1
+163176 F21Plmr 1
+163176 LMEpi24M 2
+163176 M11Plmr 1
+163176 M31Tong 2
+163176 SLEpi20M 26
+163176 SV1 100
+163176 TRRsed1 1
+6697 AQC1cm 1
+6697 AQC4cm 1
+6697 AQC7cm 4
+6697 LMEpi24M 2
+556231 AQC4cm 1
+556231 AQC7cm 1
+556231 CC1 4
+556231 CL3 5
+556231 F21Plmr 17
+556231 M11Plmr 1
+556231 M31Plmr 1
+556231 M31Tong 2
+556231 NP3 1
+556231 SLEpi20M 2
+556231 SV1 12
+556231 TRRsed3 1
+546756 AQC1cm 2
+546756 CC1 71
+546756 CL3 74
+546756 Even1 2
+555446 AQC7cm 2
+555446 CC1 29
+555446 CL3 227
+555446 Even1 2
+274500 CL3 1
+236550 AQC1cm 1
+236550 AQC4cm 2
+236550 AQC7cm 3
+236550 CC1 342
+236550 CL3 28
+236550 F21Plmr 11
+236550 LMEpi24M 2
+236550 M11Plmr 61
+236550 M11Tong 1
+236550 M31Plmr 9
+236550 M31Tong 1
+236550 SLEpi20M 20
+236550 SV1 382
+236550 TRRsed2 3
+88301 AQC4cm 1
+88301 CC1 20
+88301 CL3 1
+88301 F21Plmr 2
+88301 M11Plmr 16
+88301 M31Plmr 7
+88301 NP2 1
+88301 SLEpi20M 7
+88301 SV1 43
+102382 AQC1cm 5
+102382 AQC4cm 11
+102382 AQC7cm 7
+102382 CC1 977
+102382 CL3 19
+102382 Even1 1
+102382 Even2 5
+102382 Even3 4
+102382 F21Fcsw 1
+102382 F21Plmr 179
+102382 LMEpi24M 5
+102382 M11Plmr 790
+102382 M11Tong 5
+102382 M31Plmr 291
+102382 M31Tong 1
+102382 NP5 1
+102382 SLEpi20M 273
+102382 SV1 2373
+102382 TRRsed1 2
+102382 TS28 1
+322045 AQC4cm 1
+322045 AQC7cm 1
+322045 CC1 46
+322045 CL3 6
+322045 F21Plmr 1
+322045 M11Plmr 16
+322045 M31Plmr 7
+322045 M31Tong 1
+322045 NP5 1
+322045 SLEpi20M 2
+322045 SV1 23
+133568 AQC1cm 19
+133568 AQC4cm 96
+133568 AQC7cm 75
+133568 CC1 27
+133568 CL3 21
+133568 Even2 1
+133568 LMEpi24M 16
+133568 M11Plmr 1
+133568 M31Plmr 1
+133568 NP3 4
+133568 NP5 1
+133568 SLEpi20M 6
+133568 SV1 3
+133568 TRRsed1 2
+133568 TRRsed2 1
+97627 AQC1cm 25
+97627 AQC4cm 51
+97627 AQC7cm 76
+97627 CC1 1255
+97627 CL3 610
+97627 Even1 4
+97627 Even2 3
+97627 Even3 1
+97627 F21Plmr 19
+97627 F21Tong 5
+97627 LMEpi24M 13
+97627 M11Plmr 963
+97627 M11Tong 1
+97627 M31Fcsw 1
+97627 M31Plmr 13
+97627 M31Tong 3
+97627 NP2 3
+97627 NP5 1
+97627 SLEpi20M 5
+97627 SV1 92
+578828 SLEpi20M 2
+239064 AQC1cm 2
+239064 AQC4cm 5
+239064 AQC7cm 1
+239064 CC1 39
+239064 CL3 12
+239064 Even1 3
+239064 Even2 1
+239064 Even3 2
+239064 F21Fcsw 1
+239064 F21Plmr 36
+239064 LMEpi24M 9
+239064 M11Plmr 24
+239064 M31Plmr 47
+239064 NP3 1
+239064 SV1 1411
+239064 TRRsed1 1
+239064 TRRsed2 3
+161340 AQC1cm 9
+161340 AQC4cm 48
+161340 AQC7cm 28
+161340 CC1 272
+161340 CL3 27
+161340 Even2 2
+161340 Even3 1
+161340 LMEpi24M 5
+161340 M11Plmr 2
+161340 M31Plmr 1
+161340 SLEpi20M 3
+161340 SV1 9
+235390 AQC1cm 152
+235390 AQC4cm 184
+235390 AQC7cm 162
+235390 F21Plmr 2
+235390 M11Tong 1
+235390 NP2 1
+235390 NP3 1
+235390 SLEpi20M 2
+235390 TS29 1
+141836 AQC1cm 3
+141836 AQC4cm 8
+141836 AQC7cm 4
+339015 AQC1cm 2
+339015 AQC4cm 2
+339015 AQC7cm 1
+339015 CL3 2
+339015 F21Plmr 1
+339015 LMEpi24M 4
+339015 M11Plmr 47
+339015 M31Plmr 4
+339015 NP3 3
+339015 TRRsed2 1
+143699 CC1 2
+143699 Even3 1
+143699 LMEpi24M 9
+143699 SLEpi20M 7
+114783 AQC4cm 6
+114783 AQC7cm 5
+114783 CC1 5
+114783 Even3 1
+114783 LMEpi24M 8
+114783 SLEpi20M 18
+114783 SV1 2
+164557 AQC1cm 3
+164557 AQC4cm 1
+164557 AQC7cm 1
+164557 CL3 1
+164557 M11Plmr 1
+164557 NP2 42
+164557 NP3 299
+164557 NP5 13
+164557 TRRsed2 1
+140805 AQC4cm 2
+140805 CL3 1
+140805 NP2 11
+140805 NP3 110
+140805 NP5 10
+140805 TRRsed1 9
+140805 TRRsed2 18
+140805 TRRsed3 4
+196433 NP3 2
+196433 TRRsed2 1
+9510 AQC1cm 6
+9510 AQC4cm 2
+9510 AQC7cm 5
+9510 CC1 3
+9510 CL3 2
+9510 Even1 5
+9510 Even2 4
+9510 Even3 2
+9510 F21Fcsw 1
+9510 F21Plmr 75
+9510 F21Tong 4805
+9510 LMEpi24M 52
+9510 M11Fcsw 2
+9510 M11Plmr 13
+9510 M11Tong 119
+9510 M31Fcsw 4
+9510 M31Plmr 378
+9510 M31Tong 5694
+9510 NP2 2
+9510 NP3 8
+9510 NP5 12
+9510 SLEpi20M 6
+9510 TRRsed1 3
+9510 TRRsed2 1
+9510 TRRsed3 1
+9510 TS28 4
+9510 TS29 1
+55799 Even1 1
+55799 Even3 3
+72374 AQC1cm 2
+72374 CC1 1
+72374 Even1 12
+72374 Even2 15
+72374 Even3 23
+72374 F21Plmr 15
+72374 M11Plmr 1
+72374 NP3 1
+72374 SV1 1
+224569 Even1 13
+224569 Even2 4
+224569 Even3 9
+224569 F21Plmr 1
+224569 NP3 1
+10113 AQC1cm 4
+10113 AQC4cm 1
+10113 CC1 4
+10113 CL3 2
+10113 Even1 383
+10113 Even2 326
+10113 Even3 382
+10113 M11Plmr 4
+10113 M31Plmr 4
+10113 NP5 1
+10113 SLEpi20M 6
+10113 TRRsed1 3
+10113 TRRsed2 1
+10113 TRRsed3 4
+10113 TS29 1
+8389 M11Plmr 1
+362373 TRRsed2 2
+139641 NP5 2
+125536 AQC1cm 2
+125536 AQC4cm 6
+125536 AQC7cm 4
+125536 CC1 11
+125536 CL3 5
+125536 F21Plmr 1
+125536 LMEpi24M 2
+125536 NP2 2
+125536 SLEpi20M 1
+125536 TRRsed1 1
+125536 TRRsed2 1
+125536 TRRsed3 1
+103157 CC1 1
+103157 NP5 5
+103157 TRRsed1 7
+245893 TRRsed2 2
+245893 TRRsed3 1
+144746 TRRsed2 1
+278398 AQC1cm 5
+278398 AQC4cm 3
+278398 AQC7cm 1
+278398 Even1 1
+278398 Even2 1
+278398 M31Tong 1
+278398 NP2 1
+278398 NP3 7
+278398 NP5 4
+278398 TRRsed1 8
+278398 TRRsed2 10
+278398 TRRsed3 5
+97508 AQC1cm 4
+97508 AQC4cm 10
+97508 AQC7cm 8
+97508 F21Tong 1
+97508 LMEpi24M 1
+97508 NP2 1
+97508 NP5 1
+97508 TRRsed2 1
+248442 AQC7cm 1
+248442 NP3 2
+248442 NP5 2
+248442 TRRsed2 12
+248442 TRRsed3 1
+269778 AQC1cm 1
+269778 AQC7cm 1
+269778 CC1 4
+269778 Even2 1
+269778 Even3 2
+269778 LMEpi24M 1
+269778 M31Plmr 1
+269778 NP2 1
+269778 NP3 9
+269778 NP5 98
+269778 TRRsed1 2
+269778 TRRsed2 16
+269778 TRRsed3 3
+312030 AQC1cm 26
+312030 AQC4cm 4
+312030 AQC7cm 1
+312030 CL3 1
+312030 Even1 3
+312030 Even3 1
+312030 F21Fcsw 1
+312030 F21Tong 4
+312030 LMEpi24M 3
+312030 M31Tong 12
+312030 NP2 38
+312030 NP3 942
+312030 NP5 206
+312030 TRRsed2 3
+312030 TRRsed3 2
+312030 TS29 1
+573761 AQC4cm 1
+573761 M31Tong 3
+573761 NP2 2
+573761 NP3 10
+573761 TRRsed1 1
+573761 TRRsed2 3
+573761 TRRsed3 1
+583669 AQC4cm 1
+583669 AQC7cm 6
+583669 CC1 13
+583669 CL3 6
+583669 M11Plmr 7
+583669 SLEpi20M 1
+583669 SV1 209
+583669 TRRsed2 10
+139752 TRRsed1 6
+139752 TRRsed2 97
+139752 TRRsed3 11
+315848 AQC4cm 5
+315848 AQC7cm 4
+315848 CL3 1
+315848 Even1 2
+315848 F21Plmr 1
+315848 F21Tong 1
+315848 LMEpi24M 1
+315848 M11Tong 1
+315848 M31Plmr 1
+315848 NP2 4
+315848 NP3 8
+315848 NP5 8
+315848 TRRsed1 55
+315848 TRRsed2 347
+315848 TRRsed3 96
+516971 Even2 1
+516971 NP3 13
+516971 NP5 3
+516971 TRRsed2 1
+267568 AQC7cm 3
+267568 M31Tong 1
+267568 NP2 6
+267568 NP3 6
+267568 NP5 5
+267568 TRRsed1 14
+267568 TRRsed2 13
+267568 TRRsed3 8
+536009 NP3 2
+536009 TRRsed1 5
+536009 TRRsed2 73
+536009 TRRsed3 1
+275935 AQC1cm 1
+275935 CC1 8
+275935 CL3 220
+275935 Even1 3
+275935 Even3 1
+275935 F21Plmr 22
+275935 M11Fcsw 1
+275935 M11Plmr 488
+275935 M31Plmr 53
+275935 NP3 3
+275935 NP5 3
+275935 SLEpi20M 26
+275935 TRRsed3 1
+275935 TS29 1
+558838 M31Plmr 1
+181819 CC1 1
+181819 M31Plmr 1
+181819 NP5 3
+181819 TRRsed2 1
+141423 AQC1cm 43
+141423 AQC4cm 68
+141423 AQC7cm 80
+141423 SLEpi20M 1
+8096 AQC1cm 45
+8096 AQC4cm 99
+8096 AQC7cm 159
+8096 CC1 2
+8096 CL3 1
+8096 LMEpi24M 1
+8096 M11Plmr 2
+8096 NP2 1
+8096 SLEpi20M 1
+8096 SV1 277
+8096 TRRsed1 1
+8096 TRRsed2 25
+8096 TRRsed3 36
+142784 AQC1cm 8
+142784 AQC4cm 20
+142784 AQC7cm 21
+142784 LMEpi24M 1
+343119 F21Plmr 1
+343119 TRRsed1 7
+343119 TRRsed2 15
+343119 TRRsed3 4
+511371 CC1 3
+279470 CC1 9
+546723 NP5 1
+454435 AQC4cm 1
+454435 AQC7cm 4
+454435 CC1 1
+454435 M11Plmr 1
+454435 SV1 1
+257199 CC1 35
+257199 CL3 2
+257199 SV1 2
+238279 CC1 2
+362382 NP2 20
+362382 NP3 10
+362382 NP5 5
+362382 TRRsed1 6
+362382 TRRsed2 12
+362382 TRRsed3 4
+305967 AQC4cm 1
+305967 Even1 4
+305967 Even3 1
+305967 NP3 2
+305967 TRRsed1 2
+305967 TRRsed2 86
+305967 TRRsed3 242
+349780 NP2 1
+349780 NP3 6
+349780 TRRsed1 1
+349780 TRRsed2 7
+349780 TRRsed3 17
+512616 AQC1cm 2
+512616 AQC4cm 4
+512616 AQC7cm 8
+512616 CC1 10
+512616 CL3 35
+512616 Even3 1
+512616 SV1 98
+593006 AQC1cm 7
+593006 AQC4cm 9
+593006 AQC7cm 14
+593006 CC1 1440
+593006 CL3 372
+593006 Even1 2
+593006 Even2 1
+593006 F21Tong 1
+593006 LMEpi24M 9
+593006 M31Fcsw 1
+593006 M31Tong 1
+593006 SV1 3
+593006 TRRsed2 2
+292288 AQC4cm 2
+292288 Even2 1
+292288 NP3 3
+292288 TRRsed1 3
+292288 TRRsed2 48
+292288 TRRsed3 57
+166871 AQC1cm 2
+166871 AQC4cm 7
+166871 AQC7cm 1
+166871 CC1 1
+166871 Even1 6
+166871 Even2 1
+166871 Even3 1
+166871 F21Plmr 1
+166871 F21Tong 2
+166871 LMEpi24M 4
+166871 M11Plmr 1
+166871 M11Tong 1
+166871 M31Tong 3
+166871 NP2 1
+166871 NP3 5
+166871 NP5 1
+166871 SV1 1
+166871 TRRsed1 86
+166871 TRRsed2 1421
+166871 TRRsed3 540
+166871 TS28 3
+565944 AQC1cm 2
+565944 AQC4cm 8
+565944 AQC7cm 15
+565944 Even1 1
+565944 NP3 20
+565944 NP5 2
+565944 TRRsed1 9
+565944 TRRsed2 36
+565944 TRRsed3 76
+510626 AQC4cm 1
+510626 Even1 3
+510626 Even2 2
+510626 Even3 1
+510626 TRRsed2 7
+510626 TRRsed3 1
+351629 CC1 79
+351629 CL3 40
+351629 Even1 2
+351629 Even2 2
+351629 Even3 3
+351629 F21Plmr 8
+351629 LMEpi24M 1
+351629 M11Plmr 20
+351629 SV1 46
+351629 TRRsed2 1
+202238 CC1 1
+202238 M11Plmr 1
+102209 CC1 5
+102209 CL3 1
+102209 LMEpi24M 1
+93610 CC1 2
+93610 CL3 1
+93610 LMEpi24M 6
+93610 M11Tong 1
+93610 SV1 1
+205713 M11Plmr 2
+519193 AQC1cm 1
+519193 F21Plmr 6
+519193 M11Plmr 10
+519193 TRRsed2 1
+240591 AQC1cm 258
+240591 AQC4cm 919
+240591 AQC7cm 1037
+240591 CC1 2144
+240591 CL3 186
+240591 Even1 5
+240591 Even2 6
+240591 F21Fcsw 1
+240591 F21Plmr 2
+240591 LMEpi24M 1
+240591 M11Plmr 29
+240591 M31Plmr 18
+240591 M31Tong 1
+240591 NP2 7
+240591 NP3 1
+240591 NP5 2
+240591 SLEpi20M 3
+240591 SV1 3
+240591 TRRsed2 6
+240591 TRRsed3 2
+223711 TRRsed2 1
+107461 AQC1cm 49
+107461 AQC4cm 30
+107461 AQC7cm 13
+107461 CC1 6
+107461 Even3 2
+107461 F21Tong 3
+107461 LMEpi24M 59
+107461 M11Tong 1
+107461 NP3 1
+107461 SLEpi20M 1
+98457 AQC4cm 4
+98457 AQC7cm 1
+98457 CC1 141
+98457 CL3 48
+98457 Even1 1
+98457 F21Fcsw 1
+98457 M31Plmr 1
+98457 SLEpi20M 4
+108870 CC1 1
+108870 M11Plmr 5
+108870 SV1 26
+114978 AQC1cm 33
+114978 AQC4cm 51
+114978 AQC7cm 29
+114978 CC1 6
+114978 Even1 1
+114978 M11Plmr 2
+114978 SV1 1
+150577 AQC4cm 2
+150577 AQC7cm 1
+150577 TRRsed2 1
+545503 SV1 1
+545503 TRRsed1 1
+545503 TRRsed2 1
+545503 TRRsed3 1
+100870 NP3 1
+100870 TRRsed2 1
+243335 NP2 1
+243335 NP3 3
+243335 NP5 2
+93537 NP2 1
+93537 NP3 43
+93537 NP5 4
+93537 TRRsed1 1
+93537 TRRsed2 4
+265749 NP3 183
+265749 NP5 6
+534609 AQC1cm 1134
+534609 AQC4cm 336
+534609 AQC7cm 110
+534609 CC1 11
+534609 CL3 12
+534609 Even1 47
+534609 Even2 26
+534609 Even3 11
+534609 F21Fcsw 13
+534609 F21Plmr 8
+534609 F21Tong 47
+534609 LMEpi24M 32
+534609 M11Fcsw 10
+534609 M11Plmr 64
+534609 M11Tong 38
+534609 M31Fcsw 9
+534609 M31Plmr 4
+534609 M31Tong 185
+534609 NP2 10712
+534609 NP3 148400
+534609 NP5 15722
+534609 SLEpi20M 17
+534609 SV1 9
+534609 TRRsed1 52
+534609 TRRsed2 24
+534609 TRRsed3 20
+534609 TS28 17
+534609 TS29 16
+263307 F21Plmr 2
+263307 NP3 1
+344513 AQC1cm 1
+344513 NP2 2
+344513 NP3 63
+344513 NP5 11
+5552 AQC1cm 97
+5552 AQC4cm 75
+5552 AQC7cm 95
+5552 CC1 811
+5552 CL3 336
+5552 Even1 4
+5552 Even2 1
+5552 Even3 4
+5552 F21Fcsw 1
+5552 F21Plmr 23
+5552 F21Tong 13
+5552 LMEpi24M 1566
+5552 M11Fcsw 1
+5552 M11Plmr 34
+5552 M11Tong 23
+5552 M31Plmr 1
+5552 M31Tong 8
+5552 NP2 2
+5552 NP3 1
+5552 NP5 2
+5552 SLEpi20M 7692
+5552 SV1 896
+5552 TRRsed1 1
+5552 TRRsed2 2
+5552 TRRsed3 2
+561842 AQC4cm 5
+561842 AQC7cm 5
+561842 CC1 346
+561842 CL3 217
+561842 Even1 2
+561842 Even2 1
+561842 Even3 1
+561842 F21Tong 1
+561842 LMEpi24M 1
+561842 SV1 4
+91532 NP2 2
+91532 NP5 1
+340730 TRRsed2 1
+369734 CC1 53
+369734 CL3 9
+369734 Even3 1
+369734 SV1 253
+210865 CC1 7
+210865 CL3 27
+588604 AQC1cm 10
+588604 AQC4cm 8
+588604 AQC7cm 12
+588604 CC1 701
+588604 CL3 1393
+588604 Even1 7
+588604 Even2 1
+588604 LMEpi24M 2
+588604 M11Fcsw 1
+588604 M31Fcsw 3
+588604 NP2 1
+588604 SLEpi20M 2
+588604 SV1 5
+152931 AQC7cm 1
+152931 Even1 1
+152931 LMEpi24M 1
+152931 M31Tong 1
+152931 NP2 5
+152931 NP3 4
+152931 NP5 255
+152931 TRRsed1 1
+548884 AQC1cm 22
+548884 AQC4cm 60
+548884 AQC7cm 54
+548884 CC1 8
+548884 CL3 1
+548884 LMEpi24M 2
+548884 M11Tong 1
+548884 NP3 3
+548884 SV1 5
+548884 TRRsed2 52
+548884 TRRsed3 25
+588899 Even1 1
+588899 TRRsed1 1
+588899 TRRsed2 62
+588899 TRRsed3 102
+588899 TS28 1
+328739 TRRsed1 1
+328739 TRRsed2 60
+328739 TRRsed3 11
+112752 AQC4cm 1
+112752 AQC7cm 2
+112752 CC1 2
+112752 CL3 4
+112752 TRRsed2 1
+144065 AQC1cm 2
+144065 CC1 21
+144065 CL3 191
+144065 SV1 1
+509511 CC1 4
+509511 CL3 3
+266208 CC1 5
+266208 CL3 24
+266208 M11Plmr 2
+240686 CC1 7
+240686 CL3 1
+240686 M11Plmr 1
+240686 M31Plmr 5
+240686 NP3 4
+240686 SV1 9
+240686 TRRsed2 1
+339472 NP3 5
+339472 TRRsed1 2
+339472 TRRsed2 49
+339472 TRRsed3 28
+78316 Even2 1
+78316 SLEpi20M 2
+570817 CC1 7
+570817 M11Plmr 27
+570817 SV1 35
+487725 CC1 4
+487725 CL3 4
+487725 Even3 1
+487725 LMEpi24M 2
+487725 M11Plmr 56
+487725 M31Plmr 3
+487725 SLEpi20M 3
+487725 SV1 52
+254841 CC1 1
+254841 NP3 1
+254841 SV1 1
+511687 AQC1cm 1
+511687 SLEpi20M 6
+210791 CC1 1
+210791 Even1 1
+210791 M11Plmr 4
+114292 CC1 25
+114292 CL3 2
+114292 Even3 2
+114292 M11Plmr 1
+114292 M31Plmr 1
+114292 NP3 3
+114292 SLEpi20M 1
+114292 SV1 150
+114292 TRRsed1 1
+114029 AQC1cm 15
+114029 AQC4cm 54
+114029 AQC7cm 11
+114029 F21Plmr 1
+114029 F21Tong 1
+114029 LMEpi24M 3
+114029 M11Plmr 1
+114029 M11Tong 2
+114029 M31Plmr 1
+114029 M31Tong 15
+114029 NP2 1494
+114029 NP3 1113
+114029 NP5 1462
+114029 TRRsed1 47
+114029 TRRsed2 166
+114029 TRRsed3 19
+114029 TS28 2
+114029 TS29 2
+540283 AQC1cm 5
+540283 AQC4cm 26
+540283 AQC7cm 28
+540283 Even1 1
+540283 NP5 1
+584265 AQC7cm 1
+584265 CC1 16
+584265 M11Plmr 1
+584265 TRRsed1 7
+584265 TRRsed2 35
+584265 TRRsed3 16
+162529 AQC4cm 7
+162529 AQC7cm 9
+103639 TRRsed1 1
+103639 TRRsed2 4
+103639 TRRsed3 5
+314793 TRRsed3 1
+134929 TRRsed1 1
+134929 TRRsed2 2
+134929 TRRsed3 6
+571389 AQC1cm 3
+571389 AQC4cm 6
+571389 AQC7cm 8
+571389 NP5 1
+571389 TRRsed1 9
+571389 TRRsed2 6
+571389 TRRsed3 11
+10857 AQC4cm 8
+10857 AQC7cm 7
+77467 AQC1cm 2
+77467 AQC4cm 6
+77467 AQC7cm 3
+77467 Even1 2
+77467 LMEpi24M 1
+511558 AQC1cm 16
+511558 AQC4cm 33
+511558 AQC7cm 34
+511558 CC1 12
+511558 CL3 12
+511558 Even1 1
+511558 Even3 1
+511558 NP2 1
+511558 SV1 19
+589130 Even1 2
+589130 M31Tong 2
+589130 NP3 29
+589130 NP5 418
+218005 AQC1cm 128
+218005 AQC4cm 70
+218005 AQC7cm 125
+218005 CC1 190
+218005 CL3 13
+218005 Even1 1
+218005 Even2 1
+218005 NP2 1
+218005 SLEpi20M 2
+218005 TRRsed3 1
+241853 NP3 58
+241853 NP5 3
+241853 TRRsed1 1
+241853 TRRsed2 10
+568881 AQC4cm 3
+568881 AQC7cm 8
+136925 AQC4cm 1
+136925 AQC7cm 1
+165369 AQC7cm 1
+236118 AQC7cm 1
+236118 CC1 3
+236118 CL3 19
+236118 TRRsed1 4
+236118 TRRsed2 10
+236118 TRRsed3 6
+237963 AQC1cm 10
+237963 AQC4cm 61
+237963 AQC7cm 51
+237963 CC1 628
+237963 CL3 596
+237963 Even1 1
+237963 Even2 1
+237963 LMEpi24M 1
+237963 M11Fcsw 1
+237963 M11Plmr 1
+237963 M31Plmr 1
+237963 TRRsed2 4
+204177 CC1 55
+204177 CL3 3
+204177 SV1 14
+332464 AQC1cm 3
+332464 AQC4cm 19
+332464 AQC7cm 25
+332464 CC1 53
+332464 CL3 27
+332464 Even1 1
+332464 NP3 1
+332464 NP5 1
+332464 SV1 15
+332464 TRRsed1 1
+332464 TRRsed2 153
+332464 TRRsed3 23
+303295 AQC7cm 4
+303295 CC1 476
+303295 CL3 517
+303295 Even1 4
+303295 Even2 1
+303295 Even3 3
+303295 NP3 1
+303295 SV1 972
+303295 TRRsed1 1
+303295 TRRsed2 3
+303295 TS28 1
+303295 TS29 1
+76933 CC1 8
+76933 CL3 2
+565375 AQC1cm 16
+565375 AQC4cm 118
+565375 AQC7cm 175
+565375 CC1 17
+565375 LMEpi24M 1
+565375 SV1 28
+244674 AQC1cm 1
+244674 CC1 13
+244674 CL3 1
+244674 SLEpi20M 2
+541076 AQC1cm 53
+541076 AQC4cm 283
+541076 AQC7cm 171
+541076 NP2 1
+541076 TRRsed1 1
+541076 TRRsed2 6
+541076 TRRsed3 4
+10660 TRRsed2 5
+10660 TRRsed3 1
+261919 Even1 1
+261919 NP5 1
+143463 AQC1cm 3
+143463 AQC4cm 41
+143463 AQC7cm 29
+143463 M11Fcsw 1
+522127 AQC4cm 1
+258229 LMEpi24M 104
+258229 M11Tong 2
+258229 SLEpi20M 2
+172221 AQC1cm 416
+172221 AQC4cm 2362
+172221 AQC7cm 1922
+172221 Even1 4
+172221 LMEpi24M 4
+172221 M31Plmr 1
+172221 M31Tong 1
+172221 NP2 11
+172221 NP3 3
+172221 NP5 1
+172221 SLEpi20M 8
+172221 TS28 1
+561793 AQC1cm 337
+561793 AQC4cm 1656
+561793 AQC7cm 1443
+561793 F21Plmr 1
+561793 F21Tong 1
+561793 LMEpi24M 7
+561793 M11Tong 1
+561793 NP2 5
+561793 SLEpi20M 6
+561793 TS28 1
+561793 TS29 2
+548838 AQC1cm 5
+548838 AQC4cm 6
+548838 AQC7cm 3
+548838 Even2 1
+548838 Even3 2
+548838 F21Fcsw 1
+548838 F21Tong 6
+548838 LMEpi24M 3
+548838 M11Tong 7
+548838 M31Tong 50
+548838 NP2 75
+548838 NP3 176
+548838 NP5 7659
+548838 SLEpi20M 1
+548838 TRRsed1 10
+548838 TRRsed2 2
+548838 TS28 3
+330803 AQC1cm 4
+330803 NP3 332
+330803 NP5 8
+330803 TRRsed1 1
+330803 TRRsed2 2
+88682 NP3 1
+148678 AQC1cm 2
+148678 AQC4cm 1
+148678 NP5 1
+469382 AQC1cm 69
+469382 AQC4cm 25
+469382 AQC7cm 13
+469382 CC1 20
+469382 Even1 1
+469382 LMEpi24M 3
+469382 M31Plmr 18
+469382 SLEpi20M 7
+287759 AQC1cm 9
+287759 AQC4cm 5
+287759 AQC7cm 15
+287759 CC1 1
+287759 Even3 1
+287759 LMEpi24M 11
+287759 M11Plmr 4
+287759 M31Tong 1
+287759 NP3 3
+287759 NP5 2
+287759 SLEpi20M 3
+287759 SV1 18
+287759 TRRsed1 1
+512157 NP3 16
+512157 TRRsed1 1
+512157 TRRsed2 8
+279132 NP3 2
+279132 NP5 3
+521453 AQC7cm 1
+521453 M11Fcsw 1
+521453 M31Tong 3
+521453 NP2 2
+521453 NP3 70
+521453 NP5 291
+521453 SLEpi20M 1
+521453 TRRsed2 1
+521453 TRRsed3 1
+542003 AQC1cm 1
+542003 M31Tong 1
+542003 NP3 2
+542003 NP5 5
+580835 AQC1cm 1
+580835 Even1 1
+580835 F21Tong 1
+580835 NP3 129
+580835 NP5 7
+35881 NP2 1
+35881 NP3 18
+35881 NP5 4
+262464 F21Plmr 2
+262464 M31Plmr 20
+262464 NP3 1
+342042 CC1 49
+342042 Even2 1
+70461 F21Fcsw 1
+583211 AQC1cm 205
+583211 AQC4cm 787
+583211 AQC7cm 884
+583211 CC1 1
+583211 Even1 2
+583211 LMEpi24M 5
+583211 M11Fcsw 1
+583211 NP2 7
+583211 NP5 2
+583211 SLEpi20M 8
+583211 TS29 1
+544325 AQC1cm 269
+544325 AQC4cm 1726
+544325 AQC7cm 1901
+544325 Even1 1
+544325 Even3 1
+544325 F21Tong 2
+544325 LMEpi24M 4
+544325 M31Tong 1
+544325 NP2 8
+544325 NP3 4
+544325 NP5 1
+544325 SLEpi20M 8
+544325 TRRsed1 1
+511119 AQC1cm 2
+511119 AQC4cm 21
+511119 AQC7cm 17
+511119 M11Tong 1
+511119 NP2 1
+511119 TRRsed2 5
+511119 TRRsed3 37
+288119 CC1 6
+288119 CL3 30
+288119 SV1 10
+552779 F21Fcsw 1083
+552779 M11Tong 2
+552779 M31Plmr 2
+552779 NP3 1
+542129 M11Fcsw 1
+542129 M31Fcsw 2
+554250 CC1 1
+554250 CL3 1
+554250 Even3 1
+554250 M11Fcsw 5
+554250 M31Fcsw 2
+287763 M31Fcsw 2
+470823 M11Fcsw 4
+470823 M11Plmr 2
+470823 M31Fcsw 78
+470823 TS28 6
+298533 CC1 1
+298533 CL3 7
+298533 Even1 1
+298533 SV1 2
+245050 CL3 19
+245050 Even1 2
+245050 M11Tong 2
+245050 M31Fcsw 21
+245050 M31Tong 8
+245050 TS28 1
+343453 M31Plmr 13
+539347 CC1 1
+539347 CL3 3
+539347 Even1 52
+539347 Even2 28
+539347 Even3 35
+539347 F21Fcsw 402
+539347 F21Plmr 2
+539347 F21Tong 59
+539347 M11Fcsw 4
+539347 M11Plmr 4
+539347 M11Tong 125
+539347 M31Fcsw 9
+539347 M31Plmr 223
+539347 M31Tong 36
+539347 SV1 1
+539347 TS28 1
+291739 Even1 7
+291739 Even2 7
+291739 Even3 10
+291739 F21Fcsw 1
+291739 TRRsed3 1
+361304 Even1 1
+361304 F21Fcsw 1
+361304 M11Fcsw 1
+361304 M31Fcsw 169
+361304 TS28 21
+313166 AQC1cm 14
+313166 CC1 11
+313166 CL3 5
+313166 Even1 2421
+313166 Even2 1675
+313166 Even3 318
+313166 F21Fcsw 9
+313166 F21Tong 2
+313166 LMEpi24M 2
+313166 M11Fcsw 182
+313166 M11Plmr 29
+313166 M31Fcsw 2
+313166 M31Tong 3
+313166 NP3 2
+313166 NP5 3
+313166 TRRsed1 2
+313166 TRRsed3 23
+313166 TS28 4
+313166 TS29 1
+114455 AQC1cm 4
+114455 AQC4cm 12
+114455 AQC7cm 16
+114455 CC1 1
+114455 Even1 1
+114455 M11Fcsw 1
+114455 TRRsed2 1
+1941 AQC4cm 1
+1941 F21Plmr 159
+1941 LMEpi24M 1
+1941 M11Plmr 66
+1941 M11Tong 1
+1941 M31Plmr 103
+131765 Even3 1
+131765 M11Fcsw 1
+131765 M11Plmr 107
+131765 M11Tong 1
+131765 M31Plmr 1633
+131765 M31Tong 2
+131765 SV1 1
+131765 TRRsed1 1
+522087 Even1 26
+522087 Even2 33
+522087 Even3 55
+522087 M11Fcsw 2
+522087 M31Plmr 1
+320847 CC1 5
+320847 CL3 25
+320847 SV1 12
+137403 M11Fcsw 2
+137403 M31Fcsw 2
+137403 M31Tong 1
+137403 TS29 1
+335530 AQC1cm 1
+335530 AQC4cm 1
+335530 Even1 1
+335530 Even3 1
+335530 F21Fcsw 570
+335530 M11Fcsw 466
+335530 M11Plmr 1
+335530 M31Fcsw 26
+335530 M31Plmr 1
+335530 M31Tong 2
+335530 SV1 1
+335530 TRRsed2 1
+335530 TRRsed3 3
+335530 TS28 282
+335530 TS29 181
+233805 M31Fcsw 1
+272261 CL3 1
+272261 M31Fcsw 110
+257151 AQC1cm 33
+257151 AQC4cm 51
+257151 AQC7cm 45
+257151 CC1 100
+257151 CL3 27
+257151 Even1 2
+257151 NP5 1
+257151 SLEpi20M 691
+257151 SV1 56
+257151 TRRsed1 2
+257151 TRRsed2 2
+548184 CL3 2
+78203 CC1 3
+78203 CL3 2
+78203 LMEpi24M 3
+78203 NP2 1
+162326 AQC1cm 22
+162326 AQC4cm 11
+162326 AQC7cm 6
+162326 CL3 1
+162326 Even1 3
+162326 Even3 2
+162326 F21Tong 6
+162326 M11Fcsw 1
+162326 M11Plmr 1
+162326 M11Tong 3
+162326 M31Tong 38
+162326 NP2 640
+162326 NP3 2962
+162326 NP5 6283
+162326 TRRsed1 9
+162326 TRRsed2 4
+162326 TRRsed3 1
+162326 TS28 1
+183122 CL3 1
+556145 SLEpi20M 2
+82548 CC1 8
+82548 SLEpi20M 3
+82548 SV1 2
+113510 AQC1cm 1
+113510 CC1 5
+113510 NP3 79
+113510 NP5 2
+113510 SV1 1
+113510 TRRsed1 1
+113510 TRRsed2 1
+276351 SLEpi20M 1
+537429 CC1 3
+342865 AQC1cm 31
+342865 AQC4cm 103
+342865 AQC7cm 56
+342865 CC1 8
+342865 CL3 1
+342865 LMEpi24M 4
+342865 NP2 1
+342865 SV1 10
+342865 TRRsed2 2
+342865 TRRsed3 1
+342865 TS29 1
+550164 CC1 1
+269355 CC1 1
+269355 M11Plmr 1
+588867 CC1 1
+588867 F21Plmr 1
+588867 M11Plmr 1
+588867 SLEpi20M 8
+588867 SV1 16
+153026 CC1 2
+2538 AQC1cm 31
+2538 AQC4cm 3
+2538 AQC7cm 1
+2538 F21Tong 1
+2538 LMEpi24M 11
+2538 M11Plmr 3
+2538 NP3 3
+2538 SLEpi20M 83
+262076 LMEpi24M 1
+262076 NP3 31
+262076 SLEpi20M 11
+55727 NP3 1
+306684 AQC1cm 360
+306684 AQC4cm 429
+306684 AQC7cm 357
+306684 CC1 1097
+306684 CL3 452
+306684 Even1 5
+306684 Even2 3
+306684 M11Plmr 50
+306684 M11Tong 1
+306684 M31Tong 1
+306684 NP2 1
+306684 SLEpi20M 83
+306684 SV1 156
+306684 TRRsed1 1
+306684 TRRsed3 1
+556276 AQC1cm 2
+556276 AQC4cm 2
+556276 AQC7cm 2
+556276 CC1 53
+556276 CL3 7
+556276 Even2 1
+556276 LMEpi24M 1
+556276 SLEpi20M 10
+556276 SV1 1
+578831 SV1 1
+268617 AQC1cm 1
+268617 CC1 18
+268617 CL3 12
+268617 M11Plmr 1
+268617 SV1 1
+268617 TS29 1
+236430 CC1 3
+236430 CL3 1
+591182 AQC1cm 10
+591182 AQC4cm 14
+591182 AQC7cm 5
+591182 CC1 2
+591182 CL3 1
+591182 Even1 4
+591182 Even2 1
+591182 Even3 5
+591182 F21Fcsw 1
+591182 F21Tong 6
+591182 LMEpi24M 4
+591182 M11Fcsw 2
+591182 M11Tong 9
+591182 M31Tong 126
+591182 NP2 73
+591182 NP3 275
+591182 NP5 21030
+591182 SLEpi20M 1
+591182 TRRsed1 30
+591182 TRRsed2 4
+591182 TS28 1
+591182 TS29 4
+260458 CC1 1
+260458 M11Plmr 18
+260458 M31Plmr 1
+260458 SV1 9
+260458 TRRsed3 1
+86556 F21Plmr 17
+86556 M31Plmr 1
+86556 SV1 30
+86556 TRRsed2 1
+259956 TRRsed1 3
+259956 TRRsed3 4
+112749 CL3 68
+304950 AQC4cm 3
+304950 Even1 6
+304950 F21Tong 1
+304950 LMEpi24M 2
+304950 M31Tong 2
+304950 NP5 3
+304950 TRRsed1 182
+304950 TRRsed2 579
+304950 TRRsed3 400
+3581 CL3 6
+3581 SV1 1
+329327 CC1 11
+329327 CL3 87
+329327 Even1 1
+329327 SV1 52
+247518 CC1 6
+247518 TRRsed3 1
+591098 AQC1cm 3
+591098 AQC4cm 16
+591098 AQC7cm 27
+535000 AQC1cm 953
+535000 AQC4cm 2560
+535000 AQC7cm 2478
+535000 Even1 4
+535000 Even2 1
+535000 Even3 1
+535000 F21Tong 4
+535000 LMEpi24M 7
+535000 M11Plmr 1
+535000 M11Tong 3
+535000 NP2 14
+535000 NP3 2
+535000 SLEpi20M 15
+535000 SV1 1
+535000 TRRsed2 2
+535000 TRRsed3 1
+535000 TS28 1
+535000 TS29 3
+73076 AQC1cm 102
+73076 AQC4cm 345
+73076 AQC7cm 297
+73076 CC1 1
+73076 LMEpi24M 2
+73076 M11Fcsw 1
+73076 M31Tong 1
+73076 NP5 1
+73076 TS28 1
+73076 TS29 1
+538794 AQC1cm 38
+538794 AQC4cm 240
+538794 AQC7cm 281
+538794 LMEpi24M 2
+538794 NP2 3
+538794 SLEpi20M 2
+546975 AQC4cm 2
+546975 AQC7cm 2
+546975 CC1 72
+546975 CL3 63
+546975 Even1 1
+546975 M31Plmr 2
+546975 NP3 1
+546975 TS28 1
+341275 AQC1cm 327
+341275 AQC4cm 361
+341275 AQC7cm 580
+341275 CC1 184
+341275 CL3 227
+341275 Even2 3
+341275 F21Tong 1
+341275 M31Fcsw 2
+341275 M31Tong 1
+341275 NP2 3
+341275 NP3 1
+341275 NP5 1
+341275 SLEpi20M 4
+341275 SV1 1
+341275 TS29 1
+512598 AQC1cm 6
+512598 AQC4cm 20
+512598 AQC7cm 31
+512598 CC1 69
+512598 CL3 2
+512598 Even1 1
+512598 Even2 2
+573607 AQC1cm 6
+573607 AQC4cm 2
+573607 AQC7cm 7
+573607 CC1 1818
+573607 CL3 375
+573607 Even1 1
+573607 Even2 4
+573607 M11Plmr 9
+573607 M31Fcsw 3
+573607 SV1 28
+573607 TRRsed3 1
+573607 TS29 1
+113626 AQC4cm 2
+113626 AQC7cm 7
+113626 CC1 1214
+113626 CL3 210
+113626 Even1 3
+113626 Even2 5
+113626 Even3 12
+113626 F21Plmr 3
+113626 M11Plmr 8
+113626 M31Fcsw 1
+113626 M31Plmr 3
+113626 SV1 6650
+113626 TRRsed1 2
+113626 TS28 2
+113626 TS29 1
+223948 CC1 91
+223948 CL3 8
+223948 SV1 25
+223948 TRRsed1 1
+223948 TRRsed2 1
+254706 AQC1cm 17
+254706 AQC4cm 14
+254706 AQC7cm 12
+254706 CC1 117
+254706 CL3 8
+254706 SLEpi20M 1
+254706 SV1 34
+254706 TRRsed3 1
+258006 AQC1cm 4
+258006 AQC4cm 12
+258006 AQC7cm 6
+258006 CC1 13
+258006 CL3 3
+258006 SLEpi20M 1
+258006 SV1 27
+111986 AQC1cm 7
+111986 AQC4cm 2
+111986 AQC7cm 1
+111986 CC1 47
+111986 CL3 7620
+111986 Even1 45
+111986 Even2 1
+111986 Even3 2
+111986 F21Fcsw 1
+111986 F21Tong 2
+111986 LMEpi24M 2
+111986 M11Fcsw 2
+111986 M11Tong 1
+111986 M31Fcsw 4
+111986 M31Tong 1
+111986 NP2 1
+111986 NP3 1
+111986 SV1 3
+111986 TRRsed2 1
+111986 TRRsed3 2
+111986 TS28 1
+111986 TS29 2
+113959 AQC1cm 13
+113959 AQC4cm 15
+113959 AQC7cm 15
+113959 CC1 1879
+113959 CL3 6577
+113959 Even1 41
+113959 Even2 5
+113959 Even3 1
+113959 F21Fcsw 1
+113959 F21Plmr 3
+113959 F21Tong 2
+113959 LMEpi24M 3
+113959 M11Fcsw 2
+113959 M11Plmr 4
+113959 M31Fcsw 3
+113959 M31Tong 2
+113959 NP3 5
+113959 NP5 2
+113959 SLEpi20M 1
+113959 SV1 4
+113959 TRRsed1 1
+113959 TRRsed2 1
+113959 TRRsed3 1
+113959 TS28 2
+113959 TS29 2
+280233 CC1 2
+280233 CL3 12
+553285 TRRsed2 2
+553285 TRRsed3 8
+238997 CL3 1
+212596 AQC1cm 2
+212596 AQC4cm 8
+212596 AQC7cm 3
+212596 CC1 284
+212596 CL3 1420
+212596 Even1 3
+212596 Even2 2
+212596 F21Tong 1
+212596 SV1 1
+212596 TRRsed3 1
+204462 AQC7cm 6
+204462 CC1 279
+204462 CL3 1027
+204462 Even1 4
+204462 Even2 1
+204462 M11Fcsw 1
+204462 M31Plmr 1
+204462 M31Tong 1
+204462 TRRsed3 1
+589802 AQC1cm 4
+589802 AQC4cm 32
+589802 AQC7cm 34
+589802 CC1 35
+589802 CL3 245
+589802 Even1 1
+589802 Even2 2
+589802 Even3 1
+589802 F21Tong 1
+589802 M31Fcsw 1
+589802 SV1 4
+589802 TRRsed2 51
+589802 TRRsed3 47
+272782 AQC1cm 14
+272782 AQC4cm 79
+272782 AQC7cm 82
+272782 CC1 2
+272782 CL3 7
+272782 NP2 2
+272782 TRRsed2 1
+272782 TRRsed3 6
+229514 CL3 1
+250451 AQC4cm 1
+250451 CC1 9
+250451 CL3 14
+250451 SV1 1
+250451 TRRsed3 1
+574961 AQC1cm 1
+139346 AQC1cm 1
+139346 AQC4cm 1
+139346 AQC7cm 2
+139346 CC1 12
+139346 CL3 3
+588882 AQC1cm 1
+588882 TRRsed1 5
+588882 TRRsed2 25
+138816 AQC4cm 1
+138816 CC1 4
+138816 CL3 3
+138816 SV1 10
+28512 CL3 1
+28512 SV1 3
+279590 AQC1cm 12
+279590 AQC4cm 27
+279590 AQC7cm 22
+279590 CC1 1132
+279590 CL3 597
+279590 Even1 2
+279590 Even2 3
+279590 SV1 8
+329074 AQC1cm 1
+329074 AQC7cm 1
+329074 CC1 19
+329074 CL3 295
+329074 Even1 2
+329074 M31Fcsw 1
+329074 TRRsed3 1
+581733 AQC1cm 162
+581733 AQC4cm 349
+581733 AQC7cm 240
+581733 CL3 1
+581733 LMEpi24M 3
+581733 M31Tong 2
+581733 NP2 2
+581733 NP3 1
+581733 SLEpi20M 4
+581733 TRRsed2 3
+581733 TRRsed3 2
+1718 SV1 7
+582971 AQC1cm 1
+582971 F21Tong 1
+508980 AQC1cm 9
+508980 AQC4cm 32
+508980 AQC7cm 48
+508980 LMEpi24M 1
+508980 M11Tong 1
+508980 NP2 1
+63221 AQC1cm 113
+63221 AQC4cm 383
+63221 AQC7cm 480
+63221 Even1 1
+63221 Even3 1
+63221 F21Plmr 1
+63221 LMEpi24M 2
+63221 NP2 5
+63221 NP3 1
+63221 SLEpi20M 7
+63221 TS29 1
+577111 AQC1cm 78
+577111 AQC4cm 508
+577111 AQC7cm 551
+577111 Even1 1
+577111 LMEpi24M 5
+577111 NP2 4
+577111 NP5 1
+577111 SLEpi20M 5
+136740 AQC1cm 1
+136740 AQC4cm 15
+136740 AQC7cm 15
+136740 CC1 6
+136740 CL3 5
+136740 NP2 1
+136740 SV1 233
+144091 AQC1cm 398
+144091 AQC4cm 1331
+144091 AQC7cm 1305
+144091 CC1 150
+144091 CL3 439
+144091 Even1 4
+144091 F21Tong 2
+144091 LMEpi24M 12
+144091 M11Fcsw 1
+144091 M11Plmr 1
+144091 M11Tong 1
+144091 M31Tong 1
+144091 NP2 8
+144091 NP3 1
+144091 SLEpi20M 17
+144091 TRRsed2 1
+144091 TRRsed3 1
+239291 LMEpi24M 1
+239291 TRRsed2 1
+244496 AQC1cm 482
+244496 AQC4cm 115
+244496 AQC7cm 85
+244496 Even1 10
+244496 F21Fcsw 1
+244496 F21Tong 2
+244496 LMEpi24M 1
+244496 M11Tong 1
+244496 M31Tong 2
+244496 NP2 1
+244496 NP3 2
+244496 NP5 2
+244496 SLEpi20M 4
+244496 TRRsed3 1
+244496 TS28 1
+206632 AQC1cm 20
+206632 AQC4cm 15
+206632 AQC7cm 20
+206632 CC1 2704
+206632 CL3 1202
+206632 Even1 2
+206632 Even2 3
+206632 Even3 1
+206632 F21Tong 1
+206632 LMEpi24M 1
+206632 M11Fcsw 1
+206632 M11Plmr 16
+206632 M11Tong 1
+206632 SLEpi20M 1
+206632 SV1 1
+206632 TS28 1
+222914 AQC1cm 43
+222914 AQC4cm 135
+222914 AQC7cm 124
+222914 CC1 387
+222914 CL3 2808
+222914 Even1 8
+222914 F21Fcsw 1
+222914 LMEpi24M 1
+222914 M31Fcsw 1
+222914 M31Tong 1
+222914 NP5 1
+222914 SV1 2
+242989 AQC1cm 2
+242989 AQC4cm 29
+242989 AQC7cm 26
+242989 CC1 19
+242989 CL3 237
+242989 Even1 3
+238929 AQC1cm 21
+238929 AQC4cm 171
+238929 AQC7cm 94
+238929 CC1 33
+238929 CL3 163
+238929 Even1 2
+238929 Even3 2
+238929 F21Fcsw 1
+238929 NP2 1
+238929 SLEpi20M 2
+238929 SV1 31
+238929 TRRsed1 1
+238929 TRRsed2 1
+238929 TRRsed3 1
+155970 CL3 5
+137099 CC1 12
+137099 CL3 91
+137099 Even1 1
+137099 SV1 132
+137099 TRRsed2 1
+533173 AQC7cm 1
+544430 CC1 6
+544430 CL3 3
+309328 TRRsed2 1
+588740 AQC4cm 1
+588740 AQC7cm 1
+588740 CC1 85
+588740 CL3 54
+588740 Even2 1
+588740 SV1 3
+200064 AQC1cm 1335
+200064 AQC4cm 860
+200064 AQC7cm 659
+200064 CL3 1
+200064 Even1 10
+200064 Even2 1
+200064 Even3 2
+200064 LMEpi24M 6
+200064 M11Tong 2
+200064 M31Plmr 1
+200064 M31Tong 1
+200064 NP2 2
+200064 NP3 7
+200064 NP5 1
+200064 SLEpi20M 3
+200064 TS29 7
+582509 AQC1cm 38
+582509 AQC4cm 32
+582509 AQC7cm 27
+582509 CC1 63
+582509 CL3 2
+582509 LMEpi24M 1
+582509 TRRsed2 37
+582509 TRRsed3 26
+582509 TS29 1
+524765 CC1 1
+524765 TRRsed2 7
+524765 TRRsed3 2
+259028 AQC1cm 1
+259028 AQC4cm 2
+259028 AQC7cm 1
+259028 Even1 2
+259028 TRRsed1 29
+259028 TRRsed2 11
+259028 TRRsed3 46
+533036 AQC4cm 4
+533036 AQC7cm 1
+200049 TRRsed1 3
+200049 TRRsed2 22
+200049 TRRsed3 8
+179213 AQC4cm 1
+179213 AQC7cm 2
+179213 TRRsed3 3
+2783 LMEpi24M 1
+352654 AQC1cm 104
+352654 AQC4cm 9
+352654 AQC7cm 12
+352654 CC1 5
+352654 Even2 1
+352654 LMEpi24M 70
+352654 M11Tong 1
+352654 M31Tong 1
+352654 SLEpi20M 78
+352654 SV1 4
+352654 TRRsed1 1
+352654 TRRsed3 1
+269075 TRRsed3 2
+542353 AQC7cm 2
+244840 AQC4cm 2
+244840 AQC7cm 3
+244840 CC1 22
+244840 CL3 35
+244840 SV1 27
+513639 AQC1cm 2
+513639 AQC4cm 1
+513639 CC1 21
+513639 CL3 15
+513639 SV1 9
+513639 TRRsed2 1
+278955 AQC1cm 22
+278955 AQC4cm 1
+278955 AQC7cm 1
+278955 CC1 67
+278955 CL3 7
+278955 Even1 2
+278955 SV1 1
+321885 AQC4cm 1
+321885 CC1 161
+321885 CL3 108
+321885 Even2 1
+321885 SV1 1
+321885 TS28 1
+567145 AQC4cm 1
+567145 CL3 1
+572242 AQC1cm 29
+572242 AQC4cm 6
+572242 AQC7cm 10
+572242 CC1 262
+572242 CL3 364
+572242 Even1 2
+572242 Even2 1
+572242 LMEpi24M 2
+572242 NP3 1
+572242 SV1 10
+329570 CC1 3
+329570 CL3 5
+329570 SV1 1
+239490 AQC1cm 3
+239490 AQC4cm 1
+239490 CC1 54
+239490 CL3 6
+239490 SV1 1
+239490 TRRsed3 1
+213873 AQC7cm 2
+213873 CC1 75
+213873 CL3 23
+213873 M31Tong 1
+213873 SV1 1
+554003 AQC1cm 2
+554003 AQC4cm 5
+554003 AQC7cm 5
+554003 Even1 2
+554003 LMEpi24M 1
+554003 M11Tong 1
+554003 NP2 1
+554003 NP3 1
+554003 NP5 6
+554003 TRRsed1 130
+554003 TRRsed2 287
+554003 TRRsed3 372
+182894 M31Fcsw 2
+182894 TS28 1
+182894 TS29 29
+88335 CC1 21
+88335 CL3 9
+88335 Even2 1
+88335 Even3 2
+88335 M11Plmr 1
+88335 SV1 722
+88335 TRRsed1 1
+89763 AQC4cm 2
+89763 AQC7cm 2
+89763 CC1 204
+89763 CL3 35
+89763 M11Plmr 1
+89763 SV1 313
+89763 TRRsed1 1
+89763 TRRsed2 3
+89763 TRRsed3 3
+248299 AQC1cm 1
+248299 CC1 31
+248299 CL3 113
+248299 F21Plmr 2
+248299 M11Plmr 1
+248299 M31Plmr 9
+248299 SV1 14
+136780 AQC1cm 1
+136780 AQC4cm 1
+136780 AQC7cm 1
+136780 CC1 2
+368027 CC1 70
+368027 CL3 34
+368027 F21Plmr 1
+368027 M11Plmr 1
+368027 SV1 86
+368027 TRRsed2 1
+106480 AQC1cm 1
+106480 AQC4cm 2
+106480 AQC7cm 2
+296956 CC1 2
+319724 CC1 10
+319724 CL3 31
+319724 Even2 1
+254133 CC1 6
+254133 CL3 5
+254133 SV1 12
+70379 M11Plmr 10
+70379 M11Tong 1
+70379 M31Tong 8
+65695 CC1 1
+65695 CL3 2
+534516 Even3 1
+534516 F21Plmr 1
+534516 SV1 107
+206278 AQC1cm 4792
+206278 AQC4cm 2918
+206278 AQC7cm 1939
+206278 CC1 30
+206278 CL3 35
+206278 Even1 30
+206278 Even2 2
+206278 F21Fcsw 1
+206278 F21Tong 7
+206278 LMEpi24M 5
+206278 M11Plmr 2
+206278 M11Tong 1
+206278 M31Tong 6
+206278 NP2 17
+206278 NP3 6
+206278 NP5 2
+206278 SLEpi20M 15
+206278 TRRsed2 5
+206278 TS28 2
+206278 TS29 20
+218985 AQC1cm 72
+218985 AQC4cm 168
+218985 AQC7cm 118
+218985 CC1 174
+218985 CL3 14
+218985 LMEpi24M 2
+218985 M31Plmr 4
+218985 NP3 3
+218985 SV1 335
+218985 TRRsed1 1
+218985 TRRsed2 1
+240228 CC1 68
+240228 CL3 17
+261663 AQC1cm 1
+261663 AQC4cm 3
+261663 AQC7cm 4
+261663 CC1 29
+261663 CL3 179
+261663 Even1 2
+261663 Even3 1
+261663 NP3 2
+261663 SV1 23
+513763 AQC1cm 199
+513763 AQC4cm 2230
+513763 AQC7cm 2110
+513763 Even1 2
+513763 Even2 2
+513763 F21Tong 3
+513763 LMEpi24M 3
+513763 M11Fcsw 1
+513763 M11Tong 2
+513763 M31Fcsw 1
+513763 M31Tong 1
+513763 NP2 10
+513763 NP5 1
+513763 SLEpi20M 10
+533078 AQC1cm 9
+533078 AQC4cm 7
+533078 AQC7cm 9
+533078 CC1 16
+533078 CL3 1
+533078 Even2 1
+509572 AQC1cm 12
+509572 AQC4cm 58
+509572 AQC7cm 50
+509572 Even3 1
+509572 LMEpi24M 1
+509572 NP3 1
+509572 TRRsed1 3
+509572 TRRsed2 1
+509572 TRRsed3 6
+521924 AQC7cm 1
+521924 TRRsed3 1
+536982 TRRsed2 1
+278709 AQC1cm 13
+278709 AQC4cm 86
+278709 AQC7cm 91
+278709 LMEpi24M 1
+278709 NP3 3
+278709 SLEpi20M 1
+278709 TRRsed1 6
+278709 TRRsed2 111
+278709 TRRsed3 43
+304270 CC1 95
+304270 CL3 6
+262115 CC1 20
+262115 F21Plmr 3
+262115 M11Plmr 4
+262115 SV1 219
+68350 F21Plmr 3
+68350 F21Tong 5
+68350 M11Plmr 3
+68350 M31Plmr 5
+68350 M31Tong 2
+112263 AQC1cm 5
+112263 AQC7cm 4
+112263 LMEpi24M 1
+112263 TRRsed2 2
+215890 AQC1cm 2
+215890 CL3 1
+215890 Even2 1
+215890 Even3 1
+215890 F21Plmr 8
+215890 F21Tong 2
+215890 LMEpi24M 1
+215890 M11Fcsw 22
+215890 M11Plmr 86
+215890 M31Fcsw 3601
+215890 M31Plmr 7
+215890 M31Tong 1
+215890 TRRsed3 1
+215890 TS28 9
+215890 TS29 1
+157625 NP3 13
+157625 TRRsed3 1
+560842 AQC7cm 1
+591496 AQC1cm 35
+591496 AQC4cm 230
+591496 AQC7cm 190
+591496 CC1 3
+591496 CL3 3
+591496 NP2 1
+591496 SLEpi20M 2
+591496 TRRsed2 7
+591496 TRRsed3 9
+167215 TRRsed3 1
+185339 F21Fcsw 12
+185339 M11Fcsw 1
+185339 M31Fcsw 3
+570930 F21Fcsw 83
+570930 LMEpi24M 1
+570930 M11Fcsw 28
+570930 M31Fcsw 28
+289855 SV1 1
+289855 TRRsed3 1
+313089 CL3 1
+313089 F21Fcsw 173
+50120 AQC7cm 4
+50120 CL3 6
+50120 Even1 1
+578409 AQC1cm 30
+578409 AQC4cm 46
+578409 AQC7cm 68
+578409 CC1 6
+578409 NP2 1
+69664 M11Fcsw 2
+69664 M11Tong 1
+69664 M31Fcsw 291
+69664 M31Tong 1
+69664 TS28 1
+109792 F21Fcsw 3
+109792 M31Fcsw 1
+526490 F21Fcsw 1
+526490 M31Fcsw 4
+510817 Even1 1
+510817 F21Fcsw 42
+510817 M11Fcsw 87
+510817 M31Fcsw 1
+510817 M31Tong 1
+510817 TRRsed3 5
+510817 TS28 424
+510817 TS29 10
+538805 CC1 1
+538805 CL3 20
+538805 F21Fcsw 27
+538805 M11Fcsw 1
+538805 M31Fcsw 6
+538805 SV1 3
+334839 F21Fcsw 428
+334839 M11Fcsw 1
+334839 M11Plmr 1
+334839 M31Fcsw 4
+334839 TS29 7
+273850 CL3 3
+273850 F21Fcsw 1
+273850 M31Fcsw 2
+16072 AQC4cm 1
+16072 Even1 1
+16072 Even3 1
+16072 F21Fcsw 4
+16072 M11Fcsw 1
+183985 F21Fcsw 2
+183985 M11Fcsw 1
+183985 M31Fcsw 1
+183985 SV1 1
+214380 F21Fcsw 56
+214380 M11Fcsw 1
+262658 TS28 1
+197095 F21Fcsw 2
+197095 TS28 1
+296377 CC1 2
+296377 CL3 1
+296377 Even1 2
+296377 F21Fcsw 1133
+296377 M11Fcsw 283
+296377 M31Fcsw 1
+296377 M31Plmr 1
+296377 NP3 1
+296377 TRRsed2 1
+296377 TRRsed3 4
+296377 TS28 352
+296377 TS29 3
+25695 F21Fcsw 4
+25695 M31Fcsw 2
+25695 SV1 1
+556462 F21Fcsw 110
+556462 M11Fcsw 63
+556462 M11Plmr 6
+350731 F21Fcsw 4
+590547 SV1 1
+322112 Even3 1
+36624 F21Fcsw 3
+36624 F21Plmr 2
+36624 M11Fcsw 1
+36624 M31Fcsw 2
+350732 Even3 1
+350732 F21Tong 2
+350732 M31Fcsw 2
+193230 Even1 3
+193230 Even2 4
+193230 F21Fcsw 62
+193230 M31Fcsw 1
+193230 TS29 1
+336539 M11Fcsw 4
+336539 M31Fcsw 1
+336539 TS28 1
+192899 Even2 1
+192899 Even3 2
+192899 M31Fcsw 5
+192899 M31Plmr 1
+192899 TS28 2
+192899 TS29 14
+182728 M31Fcsw 9
+182728 TS28 3
+182728 TS29 33
+535209 TS28 1
+36647 Even1 1
+36647 F21Fcsw 3
+36647 F21Tong 1
+36647 M31Tong 1
+305437 Even2 1
+305437 M11Fcsw 1
+305437 M31Fcsw 4
+305437 TS28 1
+541602 Even1 5
+541602 Even2 2
+541602 Even3 3
+541602 M11Fcsw 1
+541602 M31Fcsw 44
+541602 TS28 2
+329313 Even1 1
+329313 TS28 1
+193343 AQC1cm 6
+193343 AQC4cm 1
+193343 AQC7cm 2
+193343 CC1 1
+193343 CL3 4
+193343 Even1 1170
+193343 Even2 700
+193343 Even3 1030
+193343 F21Fcsw 593
+193343 F21Plmr 2
+193343 F21Tong 1
+193343 M11Fcsw 4059
+193343 M11Plmr 43
+193343 M31Fcsw 453
+193343 M31Plmr 4
+193343 M31Tong 17
+193343 NP3 1
+193343 SV1 2
+193343 TRRsed1 1
+193343 TRRsed2 2
+193343 TRRsed3 88
+193343 TS28 7845
+193343 TS29 850
+333279 CC1 1
+333279 F21Fcsw 4
+291764 SV1 1
+192952 Even1 1
+192952 Even2 2
+192952 Even3 5
+192952 M31Fcsw 3
+192952 TS28 4
+192952 TS29 4
+136044 AQC1cm 3
+136044 Even1 1
+136044 Even2 1
+136044 F21Fcsw 1
+136044 M31Fcsw 3
+136044 M31Plmr 1
+136044 TS28 22
+136044 TS29 471
+276703 CC1 1
+276703 CL3 9
+276703 F21Tong 1
+276703 M31Fcsw 1
+276703 TS29 1
+237324 AQC1cm 1
+237324 M11Fcsw 3
+237324 M31Fcsw 12
+237324 TS28 116
+237324 TS29 45
+326644 F21Fcsw 2
+228762 Even1 6
+228762 Even2 7
+228762 Even3 5
+228762 F21Tong 3
+228762 M11Fcsw 4
+228762 M31Fcsw 1
+228762 M31Tong 2
+228762 TS28 3
+228762 TS29 2
+198528 CC1 2
+198528 CL3 1
+198528 F21Fcsw 7
+198528 M11Fcsw 3
+198528 TS28 101
+198528 TS29 1
+167365 AQC1cm 1
+167365 CC1 1
+167365 Even1 20
+167365 Even2 14
+167365 Even3 23
+167365 M11Fcsw 1
+167365 M31Fcsw 156
+167365 M31Plmr 13
+167365 SV1 1
+167365 TS28 1
+239053 CL3 1
+239053 Even3 1
+239053 M11Fcsw 11
+239053 M31Fcsw 22
+239053 NP5 1
+239053 TRRsed3 1
+239053 TS28 5
+239053 TS29 4
+526081 AQC1cm 4
+526081 CC1 5
+526081 Even1 348
+526081 Even2 271
+526081 Even3 275
+526081 F21Fcsw 7
+526081 F21Plmr 2
+526081 F21Tong 4
+526081 M11Fcsw 27
+526081 M31Fcsw 85
+526081 M31Plmr 4
+526081 M31Tong 2
+526081 TRRsed1 1
+526081 TRRsed3 1
+526081 TS28 32
+526081 TS29 504
+129783 Even1 8
+129783 Even2 9
+129783 Even3 13
+129783 M11Fcsw 1
+198740 F21Fcsw 2
+198740 M11Fcsw 4
+198740 M31Fcsw 4
+198740 TS28 6
+198740 TS29 3
+583500 TS28 1
+333775 AQC1cm 1
+333775 Even1 46
+333775 Even2 23
+333775 Even3 24
+333775 M31Fcsw 13
+333775 TS28 1
+333775 TS29 1
+193576 M11Fcsw 1
+292073 Even1 3
+292073 Even2 5
+292073 Even3 5
+292073 SV1 1
+234421 AQC1cm 3
+234421 AQC4cm 1
+234421 AQC7cm 1
+234421 CL3 1
+234421 Even1 286
+234421 Even2 212
+234421 Even3 340
+234421 LMEpi24M 2
+234421 M11Fcsw 7
+234421 M31Fcsw 336
+234421 M31Tong 1
+234421 TRRsed1 1
+234421 TRRsed2 1
+234421 TRRsed3 6
+234421 TS28 161
+234421 TS29 325
+307569 Even3 1
+307569 LMEpi24M 1
+307569 TS28 2
+307569 TS29 15
+292556 CC1 1
+292556 CL3 7
+292556 SV1 1
+345211 F21Fcsw 1
+345211 M11Fcsw 1
+345211 TS28 2
+345211 TS29 1
+189485 AQC1cm 13
+189485 F21Fcsw 1
+189485 M11Tong 1
+189485 M31Tong 1
+189485 TRRsed3 1
+189485 TS28 14
+189485 TS29 2075
+514272 Even3 1
+514272 F21Fcsw 30
+514272 M11Fcsw 12
+514272 M31Fcsw 15
+514272 TS28 23
+514272 TS29 3
+277196 Even1 1
+277196 Even2 1
+277196 M11Fcsw 1
+277196 M31Fcsw 1
+277196 TS28 1
+184403 AQC1cm 9
+184403 AQC4cm 1
+184403 AQC7cm 2
+184403 CC1 4
+184403 CL3 4
+184403 Even1 4422
+184403 Even2 3444
+184403 Even3 3573
+184403 F21Tong 2
+184403 LMEpi24M 1
+184403 M11Fcsw 77
+184403 M31Fcsw 62
+184403 M31Tong 5
+184403 NP3 1
+184403 SLEpi20M 1
+184403 SV1 3
+184403 TRRsed1 5
+184403 TRRsed3 35
+184403 TS28 84
+184403 TS29 13
+295754 CL3 2
+295754 M31Fcsw 1
+291710 CL3 4
+291710 TRRsed2 1
+291710 TRRsed3 2
+298252 Even1 1
+298252 F21Fcsw 1
+298252 F21Tong 1
+298252 M11Fcsw 1
+298252 M31Plmr 2
+298252 M31Tong 1
+298252 TS29 5
+346253 Even1 3
+346253 Even2 5
+346253 Even3 8
+346253 F21Fcsw 73
+346253 M11Fcsw 6
+346253 M31Fcsw 4
+346253 TRRsed3 1
+346253 TS28 1
+212619 AQC1cm 14
+212619 AQC4cm 8
+212619 AQC7cm 1
+212619 CC1 2
+212619 CL3 2
+212619 Even1 88
+212619 Even2 42
+212619 Even3 67
+212619 F21Fcsw 1369
+212619 F21Plmr 2
+212619 F21Tong 3
+212619 LMEpi24M 3
+212619 M11Fcsw 636
+212619 M11Plmr 66
+212619 M11Tong 7
+212619 M31Fcsw 4671
+212619 M31Plmr 7
+212619 M31Tong 30
+212619 NP3 4
+212619 NP5 5
+212619 SLEpi20M 4
+212619 SV1 2
+212619 TRRsed2 1
+212619 TRRsed3 142
+212619 TS28 10680
+212619 TS29 1361
+527741 Even1 1
+527741 F21Fcsw 12
+527741 M11Fcsw 16
+527741 M11Plmr 1
+527741 M31Fcsw 42
+527741 TS28 56
+527741 TS29 31
+171551 AQC1cm 383
+171551 AQC4cm 31
+171551 AQC7cm 20
+171551 CC1 48
+171551 CL3 189
+171551 Even1 172
+171551 Even2 26
+171551 Even3 51
+171551 F21Fcsw 18387
+171551 F21Plmr 11
+171551 F21Tong 35
+171551 LMEpi24M 25
+171551 M11Fcsw 13671
+171551 M11Plmr 25
+171551 M11Tong 33
+171551 M31Fcsw 50756
+171551 M31Plmr 31
+171551 M31Tong 190
+171551 NP2 6
+171551 NP3 11
+171551 NP5 29
+171551 SLEpi20M 15
+171551 SV1 55
+171551 TRRsed1 2
+171551 TRRsed2 6
+171551 TRRsed3 1008
+171551 TS28 89061
+171551 TS29 59615
+579000 F21Fcsw 30
+579000 M31Fcsw 6
+563574 TS28 2
+165257 M11Fcsw 1
+165257 TS28 1
+291610 CL3 1
+291610 F21Fcsw 4
+338272 F21Fcsw 34
+338272 M11Fcsw 2
+338272 SV1 1
+237003 TS28 1
+207832 AQC4cm 1
+207832 AQC7cm 2
+207832 M31Plmr 15
+207832 TRRsed2 1
+31334 CC1 1
+31334 CL3 1
+31334 M11Plmr 2
+31334 TRRsed2 1
+31334 TRRsed3 1
+70528 F21Plmr 2
+70528 M31Plmr 1
+566717 M31Plmr 1
+157866 TRRsed2 1
+154106 AQC1cm 3
+154106 AQC4cm 1
+154106 F21Plmr 13
+154106 F21Tong 128
+154106 LMEpi24M 1
+154106 M11Plmr 4
+154106 M11Tong 4
+154106 M31Fcsw 1
+154106 M31Plmr 197
+154106 M31Tong 26
+154106 NP5 1
+561594 AQC4cm 1
+561594 AQC7cm 1
+561594 F21Tong 1
+561594 M31Plmr 2
+561594 M31Tong 3
+137557 M31Plmr 1
+115230 M31Tong 1
+470527 F21Plmr 2
+470527 M11Plmr 3
+470527 M31Plmr 1
+471308 M11Plmr 2
+471308 NP5 1
+470599 F21Plmr 5
+470599 M11Fcsw 1
+470599 M11Plmr 154
+134207 AQC4cm 1
+134207 CC1 2
+134207 F21Plmr 1
+258375 F21Plmr 1
+258375 M31Fcsw 14
+585355 CC1 2
+26036 AQC7cm 1
+258502 CC1 2
+258502 TRRsed2 1
+258502 TRRsed3 1
+13662 TRRsed1 1
diff --git a/inst/extdata/gp500test.tree b/inst/extdata/gp500test.tree
new file mode 100644
index 0000000..9c8cfd4
--- /dev/null
+++ b/inst/extdata/gp500test.tree
@@ -0,0 +1 @@
+(((((153762:0.05351,175045:0.03915)0.695:0.176,71074:0.06636)0.985.6:0.15875,525569:0.1554)0.943.7:0.07877,557121:0.30347)0.858.4:0.13385,((((((((((((((((((560734:0.07403,(341901:0.03694,286030:0.06912)0.804.32:0.01109)0.969.24:0.0387,(((275402:0.0187,321069:0.01593)0.930.83:0.11634,(((161544:0.07894,215003:0.08119)0.981.34:0.00914,548077:0.03662)0.923.111:0.00878,(257176:0.03228,100157:0.03011)0.977.38:0.04055)0.998.88:0.01699)0.985.24:0.04029,((576085:0.00893,100757:0.02249)1.000.1367: [...]
diff --git a/inst/extdata/master_map.txt b/inst/extdata/master_map.txt
new file mode 100644
index 0000000..87fd619
--- /dev/null
+++ b/inst/extdata/master_map.txt
@@ -0,0 +1,29 @@
+#SampleID Primer Final_Barcode Barcode_truncated_plus_T Barcode_full_length SampleType Description
+CL3 ILBC_01 AACGCA TGCGTT CTAGCGTGCGT Soil "Calhoun South Carolina Pine soil, pH 4.9"
+CC1 ILBC_02 AACTCG CGAGTT CATCGACGAGT Soil "Cedar Creek Minnesota, grassland, pH 6.1"
+SV1 ILBC_03 AACTGT ACAGTT GTACGCACAGT Soil "Sevilleta new Mexico, desert scrub, pH 8.3"
+M31Fcsw ILBC_04 AAGAGA TCTCTT TCGACATCTCT Feces "M3, Day 1, fecal swab, whole body study"
+M11Fcsw ILBC_05 AAGCTG CAGCTT CGACTGCAGCT Feces "M1, Day 1, fecal swab, whole body study "
+F11Fcsw ILBC_06 AATCAG CTGATT ACGAGACTGAT Feces "F1, Day 1, fecal swab, whole body study "
+M31Plmr ILBC_07 AATCGT ACGATT CGAGTCACGAT Skin "M3, Day 1, right palm, whole body study"
+M11Plmr ILBC_08 ACACAC GTGTGT GCCATAGTGTG Skin "M1, Day 1, right palm, whole body study "
+F21Plmr ILBC_09 ACACAT ATGTGT GTAGACATGTG Skin "F1, Day 1, right palm, whole body study "
+M31Tong ILBC_10 ACACGA TCGTGT TGTGGCTCGTG Tongue "M3, Day 1, tongue, whole body study "
+M11Tong ILBC_11 ACACGG CCGTGT TAGACACCGTG Tongue "M1, Day 1, tongue, whole body study "
+F11Tong ILBC_12 ACACTA TAGTGT CGGATCTAGTG Tongue "F1, Day 1, tongue, whole body study "
+LMEpi24M ILBC_13 ACACTG CAGTGT CATGAACAGTG Freshwater "Lake Mendota Minnesota, 24 meter epilimnion "
+SLEpi20M ILBC_15 ACAGAG CTCTGT AGCCGACTCTG Freshwater "Sparkling Lake Wisconsin, 20 meter eplimnion"
+AQC1cm ILBC_16 ACAGCA TGCTGT GACCACTGCTG Freshwater (creek) "Allequash Creek, 0-1cm depth"
+AQC4cm ILBC_17 ACAGCT AGCTGT CAAGCTAGCTG Freshwater (creek) "Allequash Creek, 3-4 cm depth"
+AQC7cm ILBC_18 ACAGTG CACTGT ATGAAGCACTG Freshwater (creek) "Allequash Creek, 6-7 cm depth"
+NP2 ILBC_19 ACAGTT AACTGT TCGCGCAACTG Ocean "Newport Pier, CA surface water, Time 1"
+NP3 ILBC_20 ACATCA TGATGT GCTAAGTGATG Ocean "Newport Pier, CA surface water, Time 2"
+NP5 ILBC_21 ACATGA TCATGT GAACGATCATG Ocean "Newport Pier, CA surface water, Time 3"
+TRRsed1 ILBC_22 ACATGT ACATGT CACGTGACATG Sediment (estuary) "Tijuana River Reserve, depth 1"
+TRRsed2 ILBC_23 ACATTC GAATGT TGCGCTGAATG Sediment (estuary) "Tijuana River Reserve, depth 2"
+TRRsed3 ILBC_24 ACCACA TGTGGT GATGTATGTGG Sediment (estuary) "Tijuana River Reserve, depth 2"
+TS28 ILBC_25 ACCAGA TCTGGT GCATCGTCTGG Feces Twin #1
+TS29 ILBC_26 ACCAGC GCTGGT CTAGTCGCTGG Feces Twin #2
+Even1 ILBC_27 ACCGCA TGCGGT TGACTCTGCGG Mock Even1
+Even2 ILBC_28 ACCTCG CGAGGT TCTGATCGAGG Mock Even2
+Even3 ILBC_29 ACCTGT ACAGGT AGAGAGACAGG Mock Even3
\ No newline at end of file
diff --git a/inst/extdata/min_dense_otu_table.biom b/inst/extdata/min_dense_otu_table.biom
new file mode 100644
index 0000000..2baaf1a
--- /dev/null
+++ b/inst/extdata/min_dense_otu_table.biom
@@ -0,0 +1,32 @@
+ {
+ "id":null,
+ "format": "Biological Observation Matrix 1.0.0-dev",
+ "format_url": "http://biom-format.org",
+ "type": "OTU table",
+ "generated_by": "QIIME revision XYZ",
+ "date": "2011-12-19T19:00:00",
+ "rows":[
+ {"id":"GG_OTU_1", "metadata":null},
+ {"id":"GG_OTU_2", "metadata":null},
+ {"id":"GG_OTU_3", "metadata":null},
+ {"id":"GG_OTU_4", "metadata":null},
+ {"id":"GG_OTU_5", "metadata":null}
+ ],
+ "columns": [
+ {"id":"Sample1", "metadata":null},
+ {"id":"Sample2", "metadata":null},
+ {"id":"Sample3", "metadata":null},
+ {"id":"Sample4", "metadata":null},
+ {"id":"Sample5", "metadata":null},
+ {"id":"Sample6", "metadata":null}
+ ],
+ "matrix_type": "dense",
+ "matrix_element_type": "int",
+ "shape": [5,6],
+ "data": [[0,0,1,0,0,0],
+ [5,1,0,2,3,1],
+ [0,0,1,4,2,0],
+ [2,1,1,0,0,1],
+ [0,1,1,0,0,0]]
+ }
+
diff --git a/inst/extdata/min_sparse_otu_table.biom b/inst/extdata/min_sparse_otu_table.biom
new file mode 100644
index 0000000..d84e039
--- /dev/null
+++ b/inst/extdata/min_sparse_otu_table.biom
@@ -0,0 +1,43 @@
+ {
+ "id":null,
+ "format": "Biological Observation Matrix 1.0.0-dev",
+ "format_url": "http://biom-format.org",
+ "type": "OTU table",
+ "generated_by": "QIIME revision XYZ",
+ "date": "2011-12-19T19:00:00",
+ "rows":[
+ {"id":"GG_OTU_1", "metadata":null},
+ {"id":"GG_OTU_2", "metadata":null},
+ {"id":"GG_OTU_3", "metadata":null},
+ {"id":"GG_OTU_4", "metadata":null},
+ {"id":"GG_OTU_5", "metadata":null}
+ ],
+ "columns": [
+ {"id":"Sample1", "metadata":null},
+ {"id":"Sample2", "metadata":null},
+ {"id":"Sample3", "metadata":null},
+ {"id":"Sample4", "metadata":null},
+ {"id":"Sample5", "metadata":null},
+ {"id":"Sample6", "metadata":null}
+ ],
+ "matrix_type": "sparse",
+ "matrix_element_type": "int",
+ "shape": [5, 6],
+ "data":[[0,2,1],
+ [1,0,5],
+ [1,1,1],
+ [1,3,2],
+ [1,4,3],
+ [1,5,1],
+ [2,2,1],
+ [2,3,4],
+ [2,5,2],
+ [3,0,2],
+ [3,1,1],
+ [3,2,1],
+ [3,5,1],
+ [4,1,1],
+ [4,2,1]
+ ]
+ }
+
diff --git a/inst/extdata/mothur_example.cons.taxonomy.gz b/inst/extdata/mothur_example.cons.taxonomy.gz
new file mode 100644
index 0000000..761f8e9
Binary files /dev/null and b/inst/extdata/mothur_example.cons.taxonomy.gz differ
diff --git a/inst/extdata/qiime500-refseq.fasta b/inst/extdata/qiime500-refseq.fasta
new file mode 100644
index 0000000..3f38f3c
--- /dev/null
+++ b/inst/extdata/qiime500-refseq.fasta
@@ -0,0 +1,3070 @@
+>153762
+TACGGAGGATCCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGAGCGTAGGTGGATTGTTAAGTCAGTTGTGAAAGTTT
+GCGGCTCAACCGTAAAATTGCAGTTGAAACTGGCAGTCTTGAGTACAGTAGAGGTGGGCGGAATTCGTGGTGTAGCGGTG
+AAATGCTTAGATATCACGAAGAACTCCGACTTAGTCGAAGGA
+>175045
+TACGTAGGGGGCAAGCGTTGTCCGGATTTACTGGGTGTAAAGGGCGTGTAGGCGGGACGGCAAGTCAGATGTGAAATACC
+GAGGCTCAACTTCGGGGCTGCATTTGAAACTGTCGTTCTTGAGTGATGGAGAGGCAGGCGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCCTGCTGGACATTAACTGACGCTGAGGCGCGAAAGCGTG
+GGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGAATACTAGGTGGTGGGGGACTCGACCCCTCCG
+TGCCGGAGTTAACACAATAAGTATTCCACCGTGGGGAGTACGGCCGCAAGGTTGAAAC
+>71074
+TACGGAGGATGCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGTAGGCGGTTGTGTAAGTTAGTGGTCAAATGTC
+ATGGCTCAACCTTGGCTTGCCATTAAAACTGCACGACTCGAGTACAGACGAGGTAGGCGGAATAAGTTAAGTAGCGGTGA
+AATGCATAGATATAACTTAGAACTCCGATAGCGAAGGCAGCTTACCAGACTGTAACTGACGCTGATGCACGAGAGCA
+>525569
+TACGTATGGTGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGTGCGTAGGTGGTGAGACAAGTCTGAAGTGAAAATCC
+GGGGCTTAACCCCGGAACTGCTTTGGAAACTGCCTGACTAGAGTACAGGAGAGGTAAGTGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGACTTACTGGACTGCTACTGACACTGAGGCACGAAAGCGTG
+GGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGAATACTAGGTGTCCGGGGCCCAAAGGGGCTTC
+GGTGCCGCAGCCAACGCAATAAGTACTTCCACCTGGGGATACGTTCGCAAGAAT
+>557121
+TACGTAGGGGGCAAGCGTTATCCGGATTTACTGGGTGTAAAAGGGAGCGTAGACGGCGAGGCAAGTCTGATGTGAAAACC
+CGGGGCTCAACCCCGTGACTGCATTGGAAACTGTTTTGCTTGAGTACCGGAGAGGTAAGCGGAATTCCTAGTGTAGCGGT
+GAAATGCGTAGATATTAGGAGGAACATCAGTGGCGAAGGCGGCTTACTGGACTGAAACTGACACTGAGGCACGAAAGCGG
+TGGGGA
+>560734
+TACGTATGGTGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGTGCGTAGGTGGCAAGGCAAGTCTGAAGTGAAAAATC
+CGGGGCTCAACCCCCGGAACTGCCTTTGGAAACTGTTTAGCTGGAGTACAGGAGAGGTAAGTGGAATTCCTAGTGTAGCG
+GTGAAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGACTTACTGGACTGCTACTGACACTGAGGCACGAAAGC
+GGTGGGGGAGCGAAAACAGGATTAGATACCCTGGTACGTCCACGCCGTAAAACGATGAATACTAGGTGTTGGGTTCCAAA
+GGAACTCGGTGCCGTCGCAAACGCATTAAGTATTCCACCTGGGGAGTACGTTCGCAAGATGAAACTCAAAGGATG
+>341901
+TACGTAGGGGGCAAGCGTTATCCGGAATTACTGGGTGTAAAGGGTGCGTAGGTGGCATGGTAAGTCAGAAGTGAAAGCCC
+GGGGCTTAACCCCGGGACTGCTTTTGAAACTGTCATGCTGGAGTGCAGGAGAGGTAAGCGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCTTACTGGACTGTCACTGACACTGATGCACGAAAGCGTG
+GGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGAATACTAGGTGTCGGGGCCGTAGAGGCTTCGG
+TGCCGCAGCAAACGCAGTAAGTATTCCACCGTGGGGA
+>286030
+TACGTAGGTGGCAAGCGTTGTCCGGAATTACTGGGTGTAAAGGGAGCGTAGGCGGGCAGGCAAGTCAGGCGTGAAATATA
+TCGGCTCAACCGGTAACGGCGCTTGAAACTGCAGGTCTTGAGTGAAGTAGAGGTTGGCGGAATTCCTAGTGTAGCGGTGA
+AATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCCAACTGGGCTTTTACTGACGCTGAGGCTCGAAAGTGTGG
+GGAGCAAACAGGATTAGATACCCTGGTAGTCCACACAGTAAACGATGAATACTCGCTGTTTGCGATATACAGTAAGCGGC
+CAAGCGAAAGCGTTAAGTATTCCACCTGGGGAGTACGCCGGCAACGGTGAAACTCAAAGGAATTGACGGGGGCCCGCACA
+AGCGGAGGAACATGTGGTTTAATTCGATGATACGCGAGGAACCCTTACCGGGCTTGAATTGCAACTGAATGATGTGGAGA
+CATGTCAGCCGCAAGGC
+>275402
+TACGGAGGATCCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGAGCGTAGGCGGACGCTTAAGTCAGTTGTGAAAGTTT
+GCGGCTCAACCGTAAAATTGCAGTTGATACTGGGTGTCTTGAGTACAGTAGAGGCAGGCGGAATTCGTGGTGTAGCGGTG
+AAATGCTTAGATATCACGAAGAACTCCGATTGCGAAGGCAGCCTGCTGGACTGTAACTGACGCTGATGCTCGAAAGTGTG
+GGTATCAAACAGGATTAGATACCCTGGTAGTCCACACAGTAAACGATGAATACTCGCTGTTTGCGATATACAGTAAGCGG
+CCAAGCGAAAGCGTTAAGTATTCCACCTGGGGAGTACGCCGGCAACGGTGAAACTCAAAGGATTGACGGGGCCCGCGACA
+AGCGGAGGAACAGTGTGGTTTAATTCGATGATACGGCGAGGAACCCTTACCCGGGCTTTAGGAACTTGCAACTGAATTGA
+TGTGGA
+>321069
+TACGTAGGTGGCAAGCGTTATCCGGAATTACTGGGCGTAAAGAGTGAGCAGGCGGTTAGATAAGTCTATGGTTAAATGCA
+ACAGCCCCAATGTTGTTCGCTATAGAAACTTATTTAACTAGAGTGCGGGAGAGGTAAGTGGAACTCCATATGTAGCGGTG
+GAATGCTTAGATATAGTAGGAA
+>161544
+TACGTAGGTGGCAAGCGTTGTCCGGAATTTACTGGGTGTAAAGGGAGCGTAGGCGGGCAGGCAAGTCAAGGCGTGAAATA
+TATCGGACTCAACCGGTAACGCGCTTTGAAACTGCAGGTCTTGAGTGAAGTAGAGGTTGGCGGAATTCCTAGTGTAGCGG
+TGAAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCCAATTGGGCTTTTACTGACGCTGA
+>215003
+TACCTCCGTATGCGTCGATGTGGTTATCCTTGAGCATTGCCGCGTCTGTAAGGTTGTAGCGGTGGTTGCGGCCTCCGCCG
+ACCGTTACCGCGTACTTCTGGAGAGGGCGCAGTCCCGGGAGAGTCTTGCGGGTATCCGCAATGCTGGCGTTGGTTCCCTT
+CACAAGCTCGACGCACTTGTTGGTGTAGCTCGAAATTCCGCTCATGTGCTGGAGCAGATTCAGAGATGTGCGCTCAGCTT
+TCAGCATGAGGCGGGTGTGTCCGGTGAATTCCGCGATGATATCGCCGTACTTCACCTTGTCGCCGTCCTTGAAGCGGCAC
+TCGATCTTCATTTCCGGGTCGAGTATCGTGAATACCCGCAGCGCTACTTCGAGTC
+>548077
+TACGTAGGGGGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGTAGACGGCGAGACAAGTCTGAAGTGAAAGCCC
+GGGGCTCAACCCCGGGACTGCTTTGGAAACTGCCTTGCTAGAGTGCTGGAGAGGTAAGTGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCTTACTGGACAGTAACTGACGTTGAGGCTCGAAAGCGTG
+GGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCGGTAAACGATGGATGCTCGCTGTTTGCGTCTGACGTAAGCGGC
+CAAGCGAAAGCGTTAAGCATCCCACCTGGGGAGTACGCCGGCAACGGTGAAACTCAAAGGAATTGACGGGGCCCGCACAA
+GCGGAGGAACATGTGGTTTAATTCGATGATACGCGAGGAACCTTACCCGGGCTTGAACTGCAGGCGAACGATTCAGAGAT
+GATGAGGCCCTTCGGGCGCCTGTGGAGGTGCTGCA
+>257176
+TACGTAGGTGGCAAGCGTTGTCCGGATTTACTGGGTGTAAAGGGCGTGCAGCCGGGAATGCAAGTCAGATGTGAAATCCA
+TGGGCTTAACCCATGAACTGCATTTGAAACTGTATTTCTTGAGTACTGGAGAGGCAATCGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGATTGCTGGACAGCAACTGACGGTGAGGCGCGAAAGTGTG
+GGGGAGCAAACAGGATTAGATACCCTGGTAGTCCACACTGTAAACGATGAATACTAGGTGTGCGGGACTGACCCCCTGCG
+TGCCGCAGTTAACACAATAAGTA
+>100157
+AACGTAGGGTGGCAAGCGTTGTCCGGAATTACTGGGTGTAAAGGGAGCGCAGGCGGGAAGACAAGTTGGAAGTGAAAACC
+ATGGGACTCGAACCCATACGAATTGCTTTCAAAACTGTTTTCTGAGTAGTGCAGAGGTAGATGGAATTCCCGGTGTAGCG
+GTGGAATGCGTAGATATCGGGAGGAACACC
+>576085
+TACGGAGGATCCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGAGCGTAGGTGGATTGTTAAGTCAGTTGTGAAAGTTT
+GCGGCTCAACCGTAAAATTGCAGTTGAAACTGGCAGTCTTGAGTTACAGTAGAGGTGGGCGGAATTCGGTGGTGTTAGCG
+GTGAAATGGCTTAAGATATCACGAAGAACTCCGATTGCGAAGGCAGCTCACTAGACT
+>100757
+TACGTATGGTGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGCAGGCGGTACGGCAAGTCTGATGTGAAATCCC
+GGGGCTCAACCCCGGTACTGCATTGGAAACTGTCGGACTAGAGTGTCGGAGGGGTAAGTGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCTTACTGGACGATTACTGACGCTGAGGCTCGAAAGCGTG
+GGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGAATACTAGGTGTCGGGGAGCATTGCTCTTCGG
+TGCCGCAGCAAACGCAATAAGTATTCCACCTGGGAGTACGTTCGCAAGAATGAAACTCAAAGGAATTGACGGGGATCCGG
+CACAAGCGGTGGAGCATGTGGTTTAATTCGAAGCAACGCGAAGAACCCTTACCAAGTCTTAGACATCCCACTGACCAAGT
+ATAGTAATGTTACTTTC
+>91919
+TACGTAGGTGGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGTGTGTAGGCGGGACTGCAAGTCAGATGTGAAAATCA
+TGGGCTCAACTCATGACCTGCATTTGAAACTGTGGTTCTTGAGAGTCGGAGGGGTAATCGGAATTCCCGGTGTAGCGGTG
+AAATGCGTAGATATCGGGAGGAACACCAGTGGCGAAGGCGGATTACTGGACGACCACTGACGCTGAGACACGAAAGCGTG
+GGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCTGTAAACGATGAATGCTAGGTGTGGGTGCGATAGCATCCGTGC
+CGCAGTTAACACAATAAGCATTCCACCTGGGGATACGGCCGCAAGGTTGAAACTCAAAGGAATTGACGGGACCCGCACAA
+GCGGTGGAGCATGTGGTTTAATTCGAAGCAACGCGAAGAACCTTACCAAGTCTTGACATCCCCCTGACAGAGTATGTAAT
+GTACTTTTCCTTCGGG
+>89337
+TACGTAGGGGGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGTAGACGGCTGTGTAAGTCTGATGTGAAAGATC
+GGGGCTCAACCCCGGGCCTGCATTGGAAACTGTGCAGCTAGAGTGTCGGAGAGGTAAGTGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCTTACTGGACGATGACTGACGTTGAGGCTCGAAAGCGTG
+GGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGACTGCTAGGTGTCGGGAGGCAAAGCCTTTCGGT
+GCCGCAGCAAACGCAATAAGCAGTCCACCTGGGGAGTACGTTCGCAAGAATGAACTCAAAGGATTGACGGGGACCCGCAC
+AAGCGGTGGAGCATGTGGTTAATTCGAAGCAACGCGAAGACCTTACTGCCTCGACATCCGGATGACGCAGGT
+>542118
+TACGTATGGAGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGTGTAGGTGGCCAGGCAAGTCAGAAGTGAAAGCCC
+GGGGCTCAACCCCGGGACTGCTTTTGAAACTGCAGGGCTAGAGTGCAGGAGGGGCAAGTGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCTTGCTGGACTGTAACTGACACTGAGGCTCGAAAGCGTG
+GGAGCAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGAATACTAGTGTCGGGGCACATAAGTGCTCCGGTG
+CCGCAGCAAACGCAATAAGTATTCCACCTGGGGAGTACGTTCGCAAGATGAACTCAAAGGATTGACGGGGACCGCACAAG
+CGGTGGAGCATGTGTTAATTCGAAGCAACGCG
+>265094
+TACGGAGGATGCGAGCGTTATCCGGTATTTATTGGGTTTAAAGGGTGCGTAGGCGGACTGTCAAGTCAGACGGTAAAAGT
+ACGGGGCTCAACCTCCGCCCGCCGTTGAAACTGACGGTCTTGAGTGGGCGAGAAGTATGCGGAATGCGTGGTGTAGCGGT
+AGAAATGCATAGATA
+>128051
+TACGTAGGGAGCAAGCGTTATCCGGATTTTATTGGGGTGTAAAGGGTGCGTAGACGGGAAATTAAGTTAGTTGTGAAATC
+CCTCGGCTTAACTGAAGGAACTGCAACTAAAACTGATTTTCTTGAGTGCTGGAGAGGAAAGTGGAATTCCTAGTGTAGCG
+GTGAAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGACTTCTGGACAGTAACTGACGTTGAGGCACGAAAGTG
+T
+>23870
+GGACCACCGTACCAACTTCCAGACCAGTGATGTGAATGGTGTGATGGATGGAAAGATAGACGGCTTTATCAAAGCTTACC
+TGATGGAGTTTGCAGGAAGTGGCGAATAAATTTTCTCCCTTTCATTTGCATTCTTCCCGAATTTCTTCTTACTTTGTTCA
+TCGTTGAACTTTAAACTATAAAACACACCATGGGAAAAAGTAAAAGAAAGCAATGCACAGCCAGAAGGAAGAGCAACAAG
+CCAAGAAAGTGATGATGACGATGGGAGTAATAGCCATTGTATTGGTCATCGCCATGTTTGTGGCTTACTCTTACTGGGGC
+TGATAAAATCACTGGAAACGACATGCCGCAGGAGATAGAACGGAAATTCCTCGTGACGGGAGAATACAAGCCACAAGCGT
+A
+>160135
+TACGTAGGGGGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGTAGACGGCGATGCAAGTCTGAAGTGAAATACC
+CGGGCTCAACCTGGGAACTGCTTTGGAAACTGTATGGCTAGAGTGCTGGAGAGGTAAGCGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAAGAACACCAGTGGCGAAGGCGGCTTACTGGACAGTAACTGACGTTCAGGCTCGAAAGCGTG
+GGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGAATACTAGGTGTCGGGGGCAAAGTCTTTCGGT
+GCCGCCGCAAACGCAATAAGTATTCCACCTGGGGAGTACGTTCGCAAGAATGAAACTCAAAGGAATTGACGGGGACCCGC
+ACAAGCGGTGGAGGCATGTGGTTAATTCGAAGCAACGCGAAGAACCTTACCAAATCTTGACATCCCTCTGAAACACTCTT
+AAATCGAGTGCCTCCTTCGGGACAGAGGAGACAGGTGGTGCAGTGG
+>574226
+TACGTAGGGGGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGTAGACGGTTTTGCAAGTCTGAAGTGAAAGCCC
+GGGGCTTAACCCCGGGACTGCTTTGGAAACTGTAGGACTAGAGTGCAGGAGAGGTAAGTGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCTTACTGGACTGTAACTGACGTTGAGGCTCGAAAGCGTG
+GGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGAATACTAGGTGTCGGGGCTCATAAGAGCTTCG
+GTGCCGCAGCAAACGCAATAAGTATTCCACCTGGGGAGTACGTTCGCAAGAATGAAACTCAAAGGATTGACGGGGACCGC
+ACAAGCGGTGGAGCATGTGGTTTAATTCGAAGCAAACGCGAAGAACCTTACCAGGTACTTGACATCCTCCTGAACGGAAG
+GTAATGCTTCCGGTCCTTCGGGACAGGAGAGAC
+>329744
+TACGGAGGATGCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGAGCGCAGACGGGAGATTAAGTCAGTTGTGAAAGTTT
+GCGGCTCAACCGTAAAATTGCAGTTGATACTGGTTTCCTTGAGTGCAGTTGAGGCAGGCGGAATTCGTGGTGTAGCGGTG
+AAATGCTTAGATATCACGAAGAACTCCGATTGCGAAGGCAGCTCACTGGAGCGCAACTGACGCTGAAGCTCGAAAGTGCG
+GGTATCGAACAGGATTAGATACCCTGGTAGTCCGCACGGTAAACGATGGATGCCCGCTGTTGGTCTGAATAGGGTCAGCG
+GCCAAGCGAAAGCATTAAGCAT
+>160337
+TACGTAGGTGGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGTGTGTAGGCGGGACGACAAGTCAGATGTGAAACTCA
+TGGGCTCAACCCATGACCTGCATTTGAAACTGCCGTTCTTGAGAGTCAGAGAGGTAAATGGAATTCCCGGTGTAGCGGTG
+AAATGCGTAGATATCGGGAGGAACACCAGTGGCGAAGGCGATTTACTGGACGACCACTGACGCTGAGACACGAAAGCGTG
+GGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGAATGCTAGGTGTAGGGGCATTAAGCTTCTGTG
+CCGCAGTTAACACAATAAGCATTCCACCTGGGGAGTACGGCCGCAAGGTTGAAACTCAAAGGAATTGACGGGAGCCCGCA
+CAAGCAGTGGAGTATGTGGTTTATTCGACGCAACGCGAAGACCTTACAGTCTGCATCCTGAGAACCCGGCG
+>104310
+TGTTTCCCGATGATGGCGTCCGGACGCTGAGAGGGGAGGAGCTGCGGTTCTCCTACCGTCACAGCCTGCTGACAGACCGG
+CCGGAGGCAGTGGTACTCCGGGTCGTGTTTCAACTGACGGCGGGAAGGCCGGAGGAGATACGGCAGAAAATGGAAGAATT
+GATGACAAGGAGAAAGGCTTCACAGCCATTGGAATATCCAAGTGCGGGCAGTACATTCAAGCGACCGGAGGGCTATTTCG
+CCGC
+>253651
+AACGTAGGTCACAAGCGTTGTCCGGAATTACTGGGTGTAAAGGGAGCGCAGGCGGGCGATCAAGTTGGAAGTGAAATCCA
+TGGGCTCAACCCATGAACTGCTTTCAAAACTGATTGTCTTGAGTAGTGCAGAGGTAGGCGGAATTCCCGGTGTAGCGGTG
+GAATGCGTAGATATCGGGAGGAACACCAGTGGCGAAGGCGGCCTACTGGGCACCAACTGACGCTGGAGCTACGAAGTGTG
+GTAGACAACGAGGTATTAGACTACCCGTGGTACGT
+>76142
+TACGGAGGATGCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGAGCGCAGACGGGAGATTAAGTCAGTTGTGAAAGTTT
+GCGGCTCAAACCGTAAAAATTGCAGTTGATACTGGTTTCCTTGAGTGCAGTTGAGGCAGGCGGAATTCGTGGTGTAGCGG
+TGGAATGCTTAG
+>144921
+CAGCGCATTCAAGGACGGAAGGATGCGTCATCAATGCGCTTTCGACCTCAAACGGCCCTATACGATAACCGGAGCATTTG
+ATAACGTCGTCACTTCTTCCGACAAACCAATAATATCCGTCAGAATCGCGCCACGCCGTATCACCTGTGTCATAACAACC
+GCATCGAAACGCAGACGCATTAGCATCGACGTCACCGA
+>349277
+TACGTATGGAGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGTGTAGGTGGCCATGCAAGTCAGAAGTGAAAATCC
+GGGGCTCAACCCCGGAACTGCTTTTGAAACTGTGAGGCTGGAGTGCAGGAGGGGTGAGTGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCTCACTGGACTGTAACTGACACTGAGGCTCGAAAGCGTG
+GGAGCAACAGGATTAGATACCCTAGTAGTCCACGCCGTAAACGATGAGTACTAAGTGTTGGGAGTCAAATCTCAGTGCTG
+CAGTTAACGCAATAAGTACTCCGCCTGAGTAGTACGTTCGCAAGAATGAAACTCAAAGGAATTGACGGGGCCCGCACAAG
+CGGTGGAGCATGTGGTTAATTCGAAGCAACGCGAAGAACCTTACCAGGTCTTGACATCGATCTAAAGGCTCCAGAGATGG
+AGAGATAGCT
+>152720
+TACGTAGGTGGCAAGCGTTATCCGGAATTACTGGGTGTAAAGGGTGTGTAGGCGGGGAGGCAAGTCAGATGTGAAAATTA
+TGGGCTTAACCCATAACCTGCATTTGAAACTGTTTTTCTTGAGAGGCGGAGAGGTAAACGGAATTCCCAGTGTAGCGGTG
+AAATGCGTAGATATTGGGAGGAACACCAGTGGCGAAGGCGGTTTACTGGACGTCAACTGACGCTGAGACACGAAAGCGTG
+GGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGAATGCTAGGTGTAGGGCAGGATACTGTTCTGT
+GCCGCAGTTAACACAATAAGCATTCCACCTGGGAGTACGGCCGCAAGGTTGAAACTCAAAGGAATTGACGGGA
+>218035
+TACGTAGGGGGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGTAGACGGCAAGGCAAGTCTGGGATGGTGAAAG
+GCTGGGGCTCAACCCCGGGACTGCATTGGAAACTGTCCTGCTAGAGTGCCGGAGAGGTAAGCGGAATTCCTAGTGTAGCG
+GTGAAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCTTACTGGACGGTAACTGACGTTGAGGCTCGAAAGC
+GTGGGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCGGTAAACGATGGATACTAGGTGTTGGGTGTCACAGACATT
+CGGTGCCGCAGCAAACGCAATAAGTATTCCACCAGGGGAGTACGTTCGCAAGAATGAAACTCAAAGGAATTGACGGGGAC
+CCGCACAAGCGGTGGAGCATGTGGTTTAATTCGAAGCAACGCGAAGAACCTTACCAAGTCTTGACATCCCCCTGACAGAG
+TATGTAATGTACTTTTCTTCGGGACAGGGGAGACAGGTGGTGC
+>29284
+TACGGAGGATCCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGAGCGTAGGCGGACGCTTAAGTCAGTTGTGAAAGTTT
+GCGGCTCAACCGTAAAATTGCAGTTGATACTGGGTGTCTTGAGTACAGTAGAGGCAGGCGGAATTCGTGGTGTAGCGGTG
+AAATGCTTAGATATCACGAGAACTCCGATTGCGAAGGCAGCTTGCTGGACTGTAACTGACGCT
+>588981
+TACGTAGGTTGCAAGCGTTGTCCGGATTTACTGGGTGTAAAGGGCGTGTAGGCGGAGATGCAAGTTGGGAAGTTGAAATC
+CATGGGTCTCAACCCATGAACTGCTCTCAAAACTTGTATCCCTTGAGTATCGGAGAGGCAAGCGGAATTCCTAGTGTAGC
+GGTGAAATGCGTAGATAGTTA
+>12364
+TACGTATGGGGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGTAGGCGGCATGGCAAGTCAGAAGTGAAAGCCT
+GGGGCTCAACCCCGGAATTGCTTTTGAAACTGTCAGGCTAGAGTGTCGGAGGGTAAGCGGAATTCCTAGTGTAGCGGTGA
+AATGCGTAGATATTAGGAGGAACACCGGTGGCGAAGGCGGCTTACTGGACGATTACTGACGCTGAGGCTCGAAAGCGTGG
+GGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGAATACTAGGTGTCGGGAACAATAGTTCTTCGGT
+GCCGCAGCTAACGCATTAAGTATTCCACCTGGGGAGTACGTTCGCAAGAATGAAACTCAAAGGATTGACGGGGACCCGCA
+CAAGCGTGGAGCATGTAGGTTTATTCGACGCAACGCGAAGAACCTTACTGCTCTGACATCCCACTGACCGGTCGTAATGC
+GTACCTTTTCTTACGGAACTAGTGGACGACAGGTCGGTGC
+>139920
+TACGTAGGGGGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGTAGACGGCATCACAAGTCAGAAGTGAAAATCC
+GGGGCTCAACCCCGGAACTGCTTTTGAAACTGTGGAGCTGGAGTGCAGGAGAGGTAAGCGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAAGAACACCAGTGGCGAAGGCGGCTTACTGGACAGTAACTGACGTTGAGGCTCGAAAGCGTG
+GGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGAATACTAGGTGTTGGTGAGCAAAGCTCATCGG
+TGCCGCCGCAAACGCAATAAGTATTCCACCTGGGGAGTACGTTCGCAAGAATGAAACTCAAAGGATTGACGGGGACCCGC
+ACAAGCGGTGGAGCATGTGGTTTAATTCGAAGCAACGCGAAGAACCTTAACCAAATCTTGACATCCCTCTAGAAAAGTCC
+TTTAATCGGGCTCCTCCTTCGGGACAGAGGT
+>26061
+TACGTAGGGAGCGAGCGTTGTCCGGAATTACTGGGCGTAAAGGGCGCGTAGGCGGGGACTTAAGTTAGGTGTGAAAACTT
+CGGGCTTAACCTGAAGACTGCACTTAAAACTGGGTTTCTTGAGGGCAGGAGAGGAAAGTGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGACTTTCTGGACTGTACCTGACGCTGAGGCGCGAAAGCATG
+GGGAGCGAACAGGATTAGATACCCTGGTAGTCCATGCCGTAAACGATGAATACTAGGTGTAGGAGGTATCGACCCCTTCT
+GTGCCGCAGCTAACGCAATAAGTATTCCGCCTGGGGAGTACGGCCGCAAGGTTGAAACTCAAAGGAATTGACGGGGACCC
+GCACAAGCGGTGGAGCATGTGGTTTAATTCGACGCTACGCGAAGAACCTTACCAGGGCTTGACATCCCGCGCTATCCCAG
+GAAACTGGGAGTTCCGAGCTTCGGTTTGGACGCGGTGACAGGTGGTGCAT
+>471141
+TACGGAGGATCCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGAGCGTAGATGGATGTTTAAGTCAGTTGTGAAAGTTT
+GCGGCTCAACCGTAAAATTGCAGTTGATACTGGATGTCTTGAGTGCAGTTGAGGCAGGCGGAATTCGTGGTGTAGCGGTG
+AAATGCTTAGATATCACGAAGAACTCCGATTGCGAAGGCAGCCTGCTAAGCTGCAACTGACATTGAGGCTCGAAAGTGTG
+GGTATCAAACAGGATTAGATACCCTGGTAGTCCACACGGTAAACGATGAATACTCGCTGTTTGCGATATACGGCAAGCGG
+CCAAGTGAAAGCGTTAAGACTTCCACCGTGGGGATACGCCGGCAACGGTGAACTCAAAGGAATTGGACGGGGCCGCACAG
+CGGAGACATGTGGTTAATCGATGATACG
+>261709
+AGGGCGCAGACCTTTCTGACAGGATCATAGAAGAAAGAGGCGTGCAGTCTGCATAATTCATGGTAAGCGGCACATCAGAA
+GAAGGAGTAATGAGACCGTCCGTATCGGGAATAGCCGTATCAAGGGAAAAGCGCCACGGCCCCGGTCCTCTTCTGAAACC
+TCCTCGATATGAGTACCATGTACCTGCCGGCAGTTGATAAGCAGCCCTTCATCCAAACCCAGGACCCGGCAGTACCGCTT
+CCGGTTCTCCAGCACATCCTCTACCCGGTCCCCACATGAAGCCCCATGTTCAAACTGGTAAAAGGAGGTC
+>320879
+TACGGAGGATCCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGAGCGTAGGCGGACTATTAAGTCAGCTGTGAAAGTTT
+GCGGCTCAACCGTAAAATTGCAGTTGATACTGGTCGTCTTGAGTGCAGTAGAGGTAGGCGGAATTCGTGGTGTAGCGGTG
+AAATGCTTAGATATCACGAAGAACTCCGATTGCGAAGGCAGCTTACTGGACTGTAACTGACGCTGATGCTCGAAAGTGTG
+GGTATCAAACAGGATTAGATACCCTGGTAGTCCACACAGTAAACGATGAATACTCGCTGTTTGCGATATACAGCAAGCGG
+CCAAGCGAAAGCATTAAGTATTCCACCTGGGAGTACGCCGGCAACGGTGAAACTCAAAGGAATTGACGGGGCCGCACAAG
+CGGAGGAACATGTGGTTTAATTCGATGATACGCGAGGAACCTTACCCGGGCTTAAAGTTGACAAATGAATATAGT
+>503327
+TACGTAGGGAGCAAGCGTTATCCGGATTTATTGGGTGTAAAGGGTGCGTAGACGGGACAACAAGTTAGTTGTGAAATCCC
+TCGGCTTAACTGAGGAACTGCAACTAAAACTATTATTCTTGAGTGTTGGAGAGGAAAGTGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAGAACACCGGTGGCGAAGGCGACTTTTCTGGACAATGACTGACGTTGAGGCAACG
+>261774
+TACGGAGGATCCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGTAGGCGGCCTTTTAAGTCAGCGGTGAAAGTCT
+GTGGCTCAACCATAGAATTGCCGTTGAAACTGGGGGGCTTTGAGTATGTTTGAGGCAGGCGGAATGCGTGGTGTAGCGGT
+GAAATGCATAGATATCACGCAGAACCCGACTTGCGAAGGCAGCCTGCCAAGCCATTACTGACGCTGATGCACGAAAGACG
+TGGGGAGTCAAACCAGGATTAGATACCCTGGTAGTCCACGCAGTAAACGATGATCACTACGCTGTTTGCGATACATTGT
+>13333
+TACGTAGGTGGCAAGCGTTGTCCGGATTTACTGGGTGTAAAGGGCGTGTAGCCGGAGAGACAAGTCAGATGTGAAATCCG
+CGGGCTCAACTCGCGAACTGCATTTGAAACTGTTTCCCTTGAGTATCGGAGAGGTAACCGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAAGAACACCAGTGGCGAAGGCGGGTTACTGGACGACAACTGACGGTGAGGCGCGAAAGCGTG
+GGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCTGTAAACGATCAATACTAGGTGTGCGGGGACTGGACCCCTGCC
+GTGCCGCAGTTAACACAATAAGTATTGCACCTGGGGAGTACGATCGCAAGGTTGAAACTCAAAGGAATTGACGGGGGCCC
+GCACAAGCGGTGGATTAATGTGGTTTAATTCGAAGCAACGCGAAGAACCTTACCAGGCTTGACATCCTACTAACGAAGTA
+GA
+>13069
+TACGTAGGGAGCAAGCGTTGTCCGGAATTACTGGGTGTAAAGGGAGTGTAGGCGGGTCTCCAAGTCCGTTGTCAAATCTA
+TCGGCTCAACCGATAGCCCGCGGCGGAAACTGGAGGTCTTGAGTGAAGTAGAGGCAGGCGAATTCCTAGTGTAGCGGTAG
+AAATGCGTAGATATTAGGA
+>203691
+TACGGAGGATCCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGAGCGTAGATGGATGTTTACGTCAGTTGTGAAAGTTT
+GCGGCTCAACCGTAAAATTGCAGTTGATACTGGATATCTTGAGTGCAGTTGAGGCAGGCGGAATTCGTGGTGTAGCGGTG
+AAATGCTTAGATATCACGAAGAACTCCGATTGCGAAGGCAGCTCACTAGACTGTTACTGACACTGATGCTCGAAAGTGTG
+GGTATCAAACAGGATTAGATACCCTGGTAGTCCACACAGTAAACGATGAATACTCGCTGTTTGCGATATACAGTAAGCGG
+CCAAGCGAAAGCATTAAGTATTCCCACCTGGGGATACGCCGGCAACGGTGAAACTCAAAGGAATTGACGGGGGCCCGCAC
+AAGCGGAGGAACATGTGGTTTAATTCGATGATACGCGAGGAACCTTA
+>292521
+TACGTAGGGGGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGTAGATGGCATGGCAAGTCTGAAGTGAAAGCCC
+GGGGCTTAACCCCGGGACTGCTTTGGAAACTGTTAAGCTAGAGTGCAGGAGAGGTAAGCGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAGGAACACCGGTGGCGAAGGCGGCTTACTGGACTGTAACTGACATTGAGGCTCGAAAGCGTG
+GGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGAATACTAGGTGTAGGGGTTGTCATGACCTCTG
+TGCCGCCGCTAACGCATTAAGTATTCCGCCTGGGGATACGGTCGCAAGATTAAAACTCAAAGGAATTGACGGGGGCCCGC
+ACAAGCAGCGGAGCATGTGGTTTAATTCGAAGCAACGCGAAGAACCTTACCTAGACTTGACATCTCCTGCATTACTCTTA
+ATCGAGGAA
+>201720
+TACGGTAGGGGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGTAGTACGGCATCACAAGTTAGAAAGTGGAAAA
+TCCCGGGGCTCAACCCCGGAACTGCTTTTGAAACTGTGGAGCTGGAGTGCAGGAGAGGTAAGCGGAATTCCTAGTGTAGC
+GGTGAAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCTTACTGGACTGTAACTGACGTTGAGGCTCGAAAG
+CGTGGGGAGCAAACAGGATTAGATACCCTGGTAAGTCCACGCCGTAAACGATGAATAC
+>72610
+TACGTAGGTGGCAAGCGTTGTCCGGATTTACTGGGTGTAAAGGGCGTGTAGGCGGAGATGCAAGTCAGAAGTGAAATCCA
+TGGGCTTAACCCATGAAACTACGTTTTGTAAACTGTATCCCTTGAGTATCGGAGAGGCAGGCGGAATTCCTAGTGTAGCG
+GTGAAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCCTGCTGGACGACAAC
+>90465
+TACGTAGGGGGCGAGCGTTATCCGGAATTATTGGGCGTAAAGAGTGCGTAGGTGGCACCTTAAGCGCAGGGTTTAAGGCA
+ATGGCTCAACCATTGTTCGCCTTGCGAACTGGGGTGCTTGAGTGCAGGAGGGGAAAGTGGAATTCCTAGTGTAGCGGTGA
+AATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGTACTTTCTGGACTGTTACTGACACTGAGGCACGAAAGCGTG
+GGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAAC
+>10517
+TACGTATGGTGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGCAGGCGGTGCGGCAAGTCTGATGTGAAAGCCC
+GGGGCTCAACCCCGGTACTGCATTGGAAACTGTCGTACTAGAGTGTCGGAGGGGTAAGCGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGGTATTAGGTAGGGAACACCACGTGGCGAAGGACGGCGTTACTGGACGGATAACT
+>23777
+AACGTAGGTCACAAGCGTTGTCCGGAATTACTGGGTGTAAAGGGAGCGCAGGCGGGAAGACAAGTTGGAAGTGAAATCCA
+TGGGCTCAACCCATGAACTGCTTTCAAAACTGTTTTTCTTGAGTAGTGCAGAGGTAGGCGGAATTCCCGGTGTAGCGGTG
+GAATGCGTAGATATCGGGAGGAACACCAGTGGCGAAGGCGGCCTACTGGGCACCAACTGACGCTGAGGCTCGAAAGTGTG
+GGTAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGGATACTAGGTGTAGGAGGTATCGACCCCTCCT
+GTGCCGGAGTTAACGCAATAAGTATCCCGCCTGGGAAGTACGATCGCAAGATTAAAACTCAAAGGAATTGACGGGGCCCG
+CACAAGCGGTGGAGTATGTGGTTTAATTCGACGCAACGCGAAGAACCTTACCAGGTCTT
+>576850
+TACGGAAGATGCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGAGCGTAGGCGGGCTTTTTAAGTCAGCGGTCAAAATG
+TCGTGGCTCAACCATGTCAAGCCGTTGAAACTGTAAGCCTTGAGTCTGCACAGGGCACATGGAAATTCGTGGTGTAGCGG
+TGAAATGCTTAGATATCACGAAGAACTCCGATCGCGAAGGCATTGTGCCGGGGCATAACTGACGCTGAGGCTCGAAAAGG
+TGCGGGTAATCGAAACAGGATTACGATACCCTGGTAGTCCGACACGGT
+>531589
+TACGTAGGTGGCGAGCGTTATCCGGAATCATTGGGCGTAAAGAGGGAGCAGGCGGCCGCAAGGGTCTGTGGTGAAAGACC
+GAAGCTAAACTTCGGTAAGCCATGGAAACCGGGCGGCTGGAGTGCGGAAGAGGATCGTGGAATTCCATGTGTAGCGGTGA
+AATGCGTAGATATATGGAGGAACACCAGTGGCGAAGGCGACGGTCTGGGCCGCAACTGACGCTCATTCCCGAAAGCGTGG
+GGAGCAAATAGGATTAGATACCTTAGTAGTCCACGCCGTAAACGATGGTCACTAAGTGGTCGGGGGTACAAACCCCGGTG
+CTGCAGTCAACGCAATAGTGACCCGCCTGAGTAGTACGTTCGCAAGAAGTGAAACTACGAAAGGAATTAGGACCGGGGGG
+CCCCGACAACGAA
+>272602
+TACGTATGGTGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGCAGGCGGTACGGCAAGTCTGATGTGAAATCCC
+GGGGCTCAACCCCGGTACTGCATTGGAAACTGTCGGACTAGAGTGTCGGAGGGGTAAGTGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCTTACTGGACGATTACTGGCGCTGAGGCTCGAAAGCGTG
+GGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGAATACTAGGTGTCGGGGAGCATTGCCTTCGTG
+CCGCAGCAACGACAATAAGTAATTCCACCTCGGGGAGGTACGTTCGGTCAAGAATGAAACTACAAAGGAATTAGACGGGG
+ACCCGG
+>568172
+TACGTAGGTGGCAAGCGTTGTCCGGATTTACTGGGTGTAAAGGGCGTGCAGCCGGGCATGCAAGTCAGATGTGAAATCTC
+AGGGCTTAACCCTGAAACTGCATTTGAAACTGTATGTCTTGAGTGCCGGAGAGGTAATCGGAATTCCTTGTGTAGCGGTG
+AAATGCGTAGATATAAGGAAGAACACCAGTGGCGAAGGCGGATTACTGGACGGTAACTGACGGTGAGGCGCGAAAGCGTG
+GGGAGCGAACAGGATTAGATACCCTGGTAGTCCACGCTGTAAACGATGGATACTAGGTGTGCGGGGACTGACCCCCTGCG
+TGCCGCAGTTAACACAATAAGTATCCCACCTGGGGAGTACGATCGCAAGGTTGAAACTCAAAGGAATTGACGGGGGCCCG
+CACAAGCGGTGGATTATGTGGTTTAATTCGAAGCAACGCGAAGAACCTTACCAGGTCTTGACATCCTGCTAACGAAGTAG
+AGATACATTAGGTGCCC
+>259270
+TACGTAGGGAGCGAGCGTTGTCCGGAATTACTGGGTGTAAAGGGAGCGTAGGCGGGATCGCAAGTCAGATGTGAAAACTA
+TGGGCTTAACCCATAAACTGCATTTGAAACTGTGGTTCTTGAGTGAAGTAGAGGTAAGCGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAGGAACATCAGTGGCGAAGGCGATTTACTGGACGACAACTGACGCTGAGACACGAAAGCGTG
+GGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGAATGCTAGGTGTAGGGTCGGAAACGGGTCTGT
+GCCGCAGCTAACGCGATAAGCAATTCCACCTGGGGATACGGCCGCAAGGTTGAAACTCAAAGGAATT
+>42014
+TACGGAGGATCCGAGCGTTATCCGGATTTATTTGGGTTTAAAGGGAGCGTAGGCGGATTGTTAAGTCAGTTGTGAAAGTT
+TGCGGCTCAACCGTAAAATTGCAGTTGATAACTGGTCAGTCTTGAGTGCAGTAGAGGTGGGCGGAATTCGTGGTGTAGCG
+GTGAAATGCTTAGATATCACGAAGAACTCCGATTGCGAAGGCAGCTCACTGGAGTGTAACTGACGCTGGATG
+>264558
+TACGTAGGTGGCGAGCGTTATCCGGAATCATTGGGCGTAAAGAGGGAGCAGGCGGCCGCAAGGGTCTGTGGTGAAAGACC
+GAAGCTAAACTTCGGTAAGCCATGGAAACCGGGCGGCTGGAGTGCGGAAGAGGATCGTGGAATTCCATGTGTAGCGGTGA
+AATGCGTAGATATATGGAGGAACACCAGTGGCGAAGGCGACGGTCTGGGCCGCAACTGACGCTCATTCCCGAAAGCGTGG
+GGAGCAAATAGGATTAGATACCCTAGTAGTCCACGCCGTAAACGATGGTCACTAAGTGTCGGGGGTCAAACCCCCGGTGC
+TGCAGTACAACGACGAATAAGTCGACCCGCCTGAGTAGTA
+>330861
+TACGTAGGGGGCAAGCGTTGTCCGGAATAATTGGGCGTAAAGGGCGCGTAGGCGGCTCGGTAAGTCTGGAGTGAAAGTCC
+TGCTTTTAAGGTGGGAATTGCTTTGGATACTGTCGGGCTTGAGTGCAGGAGAGGTAGACGGAATTCCCGGTGTAGCGGTG
+AAATGCGTAGATATCGGGAGGAACACCAGTGGCGAAGGCGGATTACTGGACGACCACTGACGCTGAGACACGAAAGCGTG
+GGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCTGTAAACGATGAATGCTAGGTGTGGGTGCGATAGCATCCGTGC
+CGCAGTTAACACAATAAGCATTCCACTGGGGATACGGCCGCAAGGTTGAAACTCAAAGGATTGACGGAGCCCGCACGAAG
+CAGTGGAGTATGTGGTTTAATCGACGCAACGCGAAGAACCTTACC
+>550329
+TACGGAGGATCCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGTAGGCGGCCTTTTAAGTCAGCGGTGAAAGTCT
+GTGGCTCAACCATAGAATTGCCGTTGAAACTGGGGGGCTTGAGTATGTTTGAGGCAGGCGGAATGCGTGGTGTAGCGGTG
+AAATGCATAGATATCACGCAGAACCCCGATTGCGAAGGCAGCCTGCCAAGCCATTACTGACGCTGATGCACGAAAGCGTG
+GGGATCAAACAGGATTAGATACCCTGGTAGTCCACGCAGTAAACGATGATCACTAGCTGTTTGCGATACATTGTAAGCGG
+CACAGCGAAAGCGTTAAGTGATCCACCTGGGGAGTACGCCGGCAACGGTGAAACTCAAAGGAATTGACGGGGCCCGCACA
+AGCGGAGGAACATGTGGTTTAATTCGATGATACGCGAGG
+>328963
+TACGTAGGGAGCAAGCGTTATCCGGATTTATTGGGTGTAAAGGGTGCGTAGACGGGAATACAAGTTAGTTGTGAAATCCC
+TCGGCTTAACTGAGGAACTGCAACTAAAACTATATTTCCTGAGTGCTGGAGAGGAAAGTGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGACTTTCTGGACAGTAACTGACGTTGAGGCACGAAAGTGTG
+GGGGAGCAAACAGGATTAGATACCCTGGTAAGTCCACACCGTAAACGATGGATACTAGGTGTAGGGGATGAATAATCTTC
+TGTGCCGTCGCAAACGCAGTAAGTATCCC
+>159246
+TACGTAGGTGGCAAGCGTTGTCCGGATTTACTGGGTGTAAAGGGCGTGTAGCCGGGAGGGCAAGTCAGATGTGAAATCCA
+CGGGCTCAACTCGTGAACTGCATTTGAAACTACTCTTCTTGAGTATCGGAGAGGCAATCGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATCAGGAAGAACACCGGTGGCGAAGGCGGCTTGCTGGACTATAACTGACGCTGAGACTCGAAAGCGTG
+GGGAGCGAACAGGATTAGATACCCTGGTAGTCCACACTGTAAACGATGAATACTAGGTGTTGGGTCTCGATAGAGATTCG
+GTGCCGAAGCAAACGCATTAAGTATTCCACCTGGGGAGTACGTTCGCAAGAATGAAACTC
+>103589
+TACGGAGGATCCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGAGCGTAGATGGATGTTTAAGTCAGTTGTGAAAGTTT
+GCGGCTCAACCGTAAAATTGCAGTTGATACTGGATATCTTGAGCGCAGTTGAGGCAGGCGGAATTCGTGGTGTAGCGGTG
+AAATGCTTAGATATCACGAAGAACTCCGATTGCGAAGGCAGCCTGCTAAGCTGCAACTGACATTGAGCTCGAAAGTGTGG
+GTATCAAACAGGATTAGATACCCTGGTAGTCCACACGGTAAACATGAATACTCGCTTGTTTGCGATATACAGCAAGCGGC
+CAAGCGAAAGCGTTAAGTATTCCACCGTGGGGACGTACGGCCGGACAACGGTGAAA
+>136145
+TACGTAGGTGGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGCGCGTAGGCGGGGATGCAAGTCAGATGTGAAATCTA
+TGGGCCTTAACCCATAAACTGCATTTGAAACTGTATCTCTTGAGTGCTGGAGAGGTAGACGGAATTCCTTGTGTAGCGGT
+GAAATGGCGTAGATATAAGGAAGAACCACCAGGTGGCGAAGGCGGTCTACTGGACAGTAACTGACGCTGAGGCGCGAGAG
+CGGTGGGGAG
+>247816
+TACGGAAGGTCCGGGCGTTATCCGGATTTATTGGGTTTAAAGGGAGCGCAGGCCGTCCTTTAAGCGTGCTGTGAAATGCC
+GCGGCTCAACCGTGGCACTGCAGCGCGAACTGGAGGACTTGAGTACGCACGAGGTAGGCGGAATTCGTGGTGTAGCGGTG
+AAATGCTTAGATATCACGAAGAACTCCGATTGCGAAGGCAGCTTACCGGAGCGCAACTGACGCTGAGGCTCGAAAGCGCG
+GGTATCGAACAGGATTAGATACCCTGGTAGTCGCGCGGTAAACGATGGATGCCCGCCGTTGGGATTTGGATTTCAGCGGC
+CAAGCGAAAGCGTTAAGCATCCCACCTGGGGAGTACGCCGGCAACGGTGAAAC
+>279083
+AACGTAGGTCACAAGCGTTGTCCGGAATTACTGGGTGTAAAGGGAGCGCAGGCGGGAAGACAAGTTGGAAGTGAAATCTA
+TGGGCTCAACCCATAAACTGCTTTCAAAACTTGTTTTTCTTGAGTAGTGCAGAGGTAGGCGGAACTTCCCGGTGTAGCGG
+TGAATGCGTAGATA
+>558480
+TACGTAGGTGGCAAGCGTTGTCCGGATTTACTGGGTGTAAAGGGCGTGTAGCCGGGAGGGCAAGTCAGATGTGAAATCCA
+CGGGCTCAACTCGTGAACTGCATTTGAAACTACTCTTCTTGAGTATCGGAGAGGCAATCGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGATTGCTGGACGACAACTGACGGTGAGGCGCGAAAGCGTG
+GGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCTGTAAACGATGGATACTAGGTGTTGCGGGTATCGACCCCTGCA
+GTGCCGAAGCAAACGCGATAAGTATCCCGCCTGGGGAGTACGCACGCAAGTGTGAAACTCAAAGG
+>373357
+TACGGAGGGTGCGAGCGTTGTCCGGAATCATTGGGCGTAAAGAGTTCGTAGGTGGCCTGTTAAGTCTGGTGTTAAAATGC
+AGATGCTCAACATCTGTTCGGCACTGGATACTGGCAAGCTTGAATGCGGTAGAGGTAAAGGGAATTCCTGGTGTAGCGTG
+AATAGCGTAGATATCAGGAGGAACATCGGTGGCGTAAGCGCTTACTTGGGCCGGTAATTGACACTGAGGAACGAAAGACC
+AGGGTAAGGCAAATAGGGATTAGAGTACCCCACGTAGT
+>306610
+TATGTAGGTGGCAAGCGTTGTCCGGATTTACTGGGTGTAAAGGGCGTGCAGCCGGGAATGCAAGTCAGATGTGAAATCCA
+TGGGCTTAACCCATGAACTGCATTTGAAACTGTATTTCTTGAGTACTGGAGAGGCAATCGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGATTGCTGGACAGCAACTGACGGTGAGGCGCGAAAGTGTG
+GGGAGCAGACAGGATTAGACTACCCGTGGTACGTCCACACTAGTAAACGATGAATACGTAGGTGTGGCGGGACTCGACCC
+CTGCGT
+>334326
+TACGGAGGATCCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGAGCGTAGGTGGATTGTTAAGTCAGTTGTGAAGGTTT
+GCGGCTCAACCGTAAAATTGCAGTTGAAACTGGCAGTCTTGAGTACAGTAGAGGTGGGCGGAATTCGTGGTGTAGCGGTG
+AAATGCTTAGATATCACGAAGAACTCCGATTGCGAAGGCAGCTCACTAGACTGTTACTGACACTGATGCTCGAAAGTGTG
+GGTATCAAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGAATACTAGGTGTCGGGGAGCATTGCTCTTCGG
+TGCCGCAGCAAACGCAATAAGTATTCCACCTGGGGAGTACGTTCGCAAGAATGAAACTCAAAGGAATTGACGGGGACCCG
+CACAAGCGGTGGAGCATGTGGTTTAATTCGACGCAACGCGAAGAACCTTACCAAGTCTTGACATCCTTCTGACCTAGTAT
+GTAATGTACTATCTCTTCGGAGC
+>18643
+TACGGAGGATCCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGAGCGTAGGTGGATTGTTAAGTCAGTTGTGAAAGTTT
+GCGGCTCAACCGTAAATTGCAGTTGAACTGGGAGTTCTTGAGTACAGTAGAGGTGGGCGGAATTCGTGGTGTAGCGGTGA
+AATGCTTAGATATCACGAAGAACTCCGATTGCGAAGGCAGCTCACTAGACTTGTTACTGACAC
+>355680
+TACGTAGGGAGCAAGCGTTATCCGAATTTACTGGGTGTAAAGGGCGAGTAGGCGGATTGGCAAGTTGGGAGTGAAATGTC
+GGGGCTTAACCCCGGAACTGCTTCCAAAACTGTTGATCTTGAGTGATGGAGAGGCAGGCGGAATTCCCAGTGTAGCGGTG
+AAATGCGTAGAGATTGGGAGGAACACCAGTGGCGAAGGCGACTAACTGGACTGTAACTGACGCTGAGGCGCGAAAGTGTG
+GGGAGCAAACAGGATTAGATACCCTGGTAGTCCACACTGTAAACGATGAATGCTAGGTGTAGGGGGTATCGACCCCTTCT
+GTGCCGCAGTCAACACAATAAGCATTCCGCCTGGGGAGTACGGCCGCAAGGTTGAAACTCAAAGGATTGACGGGGCCCGC
+ACAAGCAGCGGAGCGTGTGGTTTAATTCGACGCAACGCGAAGAACCTTACCAGGTCTTGACATCCACTTAAACTTACAGA
+GATGTAAGGTGTGCTTGCACAAAGTGAG
+>547752
+TACGAAGGGGGCGAGCGTTGTTCGGAATTACTGGGCGTAAAGGGAGTGTAGGCGGTTATGTAAGATAGTGGTGAAATCCC
+AGAGCTTAACTTTGGAATTGCCATTATGACTATGTGGCTAGAATTACAGAGAGGATAGTGGAATACCCAGTGTAGAGGTG
+AAATTCGTAGATATTGGGTAGAACACCAGTGGCGAAGGCGACTATCTGGCTGTATATTGACGCTGAGGCTCGAAAGCATG
+GGGATCAAACAGGATTAGATACCCTGGTAGTCCATGCTGTAAACGATGAGTGCTTGTTGTCGGTGTAAAATCGGTGACGA
+AGTTAACGCGTTAAGCACTCCGCCTGGGGAGTACGGTCGCAAGATTAAAACTCAAAGGAATTGACGGGGACCCGCACAAG
+CGGTGGAGCATGTGGTTTAATTCGATGCAACGCGAAGAACCTTACCAACTCTTGACATCCTTATCGGGGAGATCAGAGAT
+GATTTCTTTCAG
+>166835
+TACGTAGGGGGCTAGCGTTGTCCGGAATTACTGGGTGTAAAGGGAGCGCAGGCGGGAAGACAAGTTGGAAGTGAAATCCA
+TGGGCTCAACCCATGAACTGCTTTCAAAACTGTTTTTCTTGAGTAGTGCAGAGGTAGGCGGAATTCCCGGTGTAGCGGTG
+GAATGCGTAGATATCGGGAGGAACACCAGTGGCGAAGGCGGCCTACTGGACACCAACTGACGCTGAGGCTCGAAAGTGTG
+GGTAGCAAACAGGATTAGATACCCTGGTAGTCCACACTGTAAACGATGATTACTAGGTGTTGGAGGATTGACCCTTCAGT
+GCCGCAGTTAACACAATAAGTAATCCACCTAGGGAGTACGACCGCAAGGTTGAAACTCAAAGGAATTGACGGGGCCCGCA
+CAAGCAGTGGAGTATGTGGTTTAATTCGACGCAACGCGAAGAACCTTACCAAGTCTTGACATCCCTTGACAATGCTGGAA
+ACAGTATTTCTCTTCGGAGCAAAGGAGACGAGGTGGTGCA
+>295422
+TACGTAGGTGGCGAGCGTTATCCGGAATTACTGGGTGTAAAGGGTGTGTAGGCGGGACGACAAGTCAGATGTGAAAATTG
+CAGGCTCAACCTGGAAAGTGCATTTGAAACTGCCGTTCTTGAGAGTCGGAGAGGTAAATGGAATTCCCGGTGTAGCGGTG
+AAATGCGTAGATAGTCGGGAGGAACCACCAGTGGCGAAGGCGTATTTACTGGACGACAACTGACGCTGAGACACGAAAGC
+GTGGGGAGCAAACAGGATTAGACTACCCGTGGTACGTCCACGCTAGTAAACG
+>234044
+TACGGAGGATCCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGAGCGTAGATGGATGTTTAAGTCAGTTGTGAAAGTTT
+GCGGCTCAACCGTAAATTGCAGTTGATACTGGATATCTTGAGTGCAGTTGAGGCAGGCGGAATTCGTGGTGTAGCGGTGA
+AATGCTTAGATATCACGAAGAACTCCGATTGCGAAGGCAGCCTAGCCTAAGCCGTCACTA
+>207982
+TACGTAGGGTGGCGAGCGTTTATCCGGGTAAATCATTGGGCGTAAAGAGGGAGCAGGCGGCCGCAAGGGGTCTGTGGTGA
+AAGACCGAAGCTAAACTTCGGTAAGCCATGGAAACCGGGCGGCTTGGAGTGCGGAAGAGGATCGTGGAATTCCATGTGTA
+GCGGTGAAATGCGTAGATATATGGAGGAACACCAGTGGCGAAGGCGACGATCTGGGCCGCAACTGACGCTCATT
+>470724
+TACGTAGGGGGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGTAGACGGTGTGGCAAGTCTGATGTGAAAGGCA
+TGGGCTCAACCTGTGGACTGCATTGGAAACTGTCATACTTGAGTGCCGGAGGGGTAAGCGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCTTACTGGACGGTAACTGACGTTGAGGCTCGAAAGCGTG
+GGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGAATACTAGGTGTCGGGTGGCAAAGCCATTCGG
+TGCCGTCGCAGACGCAGTAAGTATTCCACCTGGGGAGTACGTTCGCAAGAATGAAACTCAAAGGAATTGACGGGGACCCG
+CACAAGCGGTGGAGCATGTGGTTTAATTCGAAGCAACGCGAAGAACCTTACCAAGTCTTGACATCCCTCTGACCGGAACT
+TTAACCGTTCCTTTCCTCGGG
+>394191
+TACGTAGGGAGCGAGCGTTATCCGGATTTATTGGGCGTAAAGCGTGCGTAGGCGGTTTATTAAGTTTAAGATAAAAGCCC
+GGGGCTCAACTCCGGTTCGTCTTAAAAACTGGTAGACTTGAGTGTGGTAGAGGTAAATGGAATTTCTAGTGTAGCGGTGA
+AATGCGTAGATATTAGAAGGAGCACCAGTGGCGAAGGCGATTTACTGGGCCATAACTGACGCTGAGGCACGAAAGCGTGG
+GGAGCAAATAGGATTAGATACCCTAGTAGTCCACGCCGTAAACGATGAGTACTAAGTGTTGGAGATTCCAGTGCTGAAGC
+TAACGTATTAAGTACTCCGCCTGAGTAGTACGGTCGCAAGGCTGAAACTCAAAGGAATTGACGGGCACCCGCACAAGCGG
+TGGAGCATGCTGTTTAATTCGAAGATACGCGAAGAACCTTACCTAGACTTGACATCCCCCGGCAAAGACATGGAAACATG
+TTTGGAGGTTTACCCGGGAGACGAGGTGGTGCATGGTTGT
+>151439
+TACGGAAGGTCCGGGCGTTATCCGGATTTATTGGGTTTAAAGGGAGCGTAGGCCGGAGATTAAGCGTGTTGTGAAATGTA
+GTGGCTCAACCTCTGCACTGCAGCGCGAACTGGTCTTCTTGAGTACGCACAACGTGGGCGGAATTCGTGGTGTAGCGGTG
+AAATGCTTAGATATCACGAAGAACTCCGATTGCGAAGGCAGCTCGACGGGAGCGCAACTGACGCTGAAGCTCGAAAGTGA
+CGGGTATCGAACAGGACTTTAGATACCCCTGGCCAAGTCCG
+>101544
+TACGTATGGTGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGCAGGCGGTGCGGCAAGTCTGATGTGAAAGCCC
+GGGGCTCAACCGCGGGACTGCATTGGAAACTGTCGTACTAGAGTGTCGGAGAGGTTAGTGGAATTCCCAGTGTAGCGGTG
+AAATGCGTAGAGATTGGGAGGAACACCAGTGGCGAAGGCGACTAACTGGACTGTAACTGACGCTGAGGCGCGAAAGTGTG
+GGGAGCAAACAGGATTAGATACCCTGGTAGTCCACACTGTAAACG
+>356589
+TACGGAAGGTTCGGGCGTTATCCGGATTTATTGGGTTTAAAGGGAGCGTAGGCCGGAGATTAAGCGTGTTGTGAAATGTA
+GATGCTCAACATCTGAACTGCAGCGCGAACTGGTTTCCTTGAGTACGCACAAAGTGGGCGGAATTCGTGGTGTAGCGGTG
+AAATGCTTAGATATCACGAAGAACTCCGATTGCGAAGGCAGCTCACTGGAGCGCAACTGACGCTGAAGCTCGAAAGTGCG
+GGTATCGAACAGGATTAGATACCCTGGTAGTCCACGCGGTAAACGATGGATGCTCGCTGTTTGCGTCTGACGTAAGCGGC
+CAAGCGAAAGCGTTAAGCATCCCACCTGGGGAGTACGCCGGCAACGGTGAAACTCAAAGGAATTGACGGGGGCCGCGACA
+AGCGGAGGAACATGTGGTTAATTCGATGATACGCGAGGAACCTTACCGGCTTGAACTGCAGGCGAACGAATCAGAGATGA
+TGAGGCCC
+>107184
+TACGTAGGTGGCGAGCGTTATCCGGAATCATTGGGCGTAAAGAGGGAGCAGGCGGCCGCAAGGGTCTGTGGTGAAAGACC
+GAAGCTAAACTTCGGTAAGCCATGGAAACCGGGCGGCTAGAGTGCGGAAGAGGATCGTGGAATTCCATGTGTAGCGGTGA
+AATGCGTAGATATATGGAGGAACACCAGTGGCGAAGGCGACGGTCTGGGCCGGCAACTGACGCTCATTCCCGAAACGACG
+TGGGGAGGCAAATAAGGATTAGATACCTACGTAGTCACGCCGTAAACG
+>163176
+ATACGGAGGATCCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGAGCGTAGGCGGGTTGGTTAAGTCAGTTGTGAAAGT
+TTAGCGGCTCAACCGTAAAATTGCAGTTGATACTTGGACGACCTTTGAGTGCAACAGAGGTAGGCGGAATTCGTGGTGTA
+GCGGTGAAATGCTTAGATATCACGAAGAACTCCGATTGCGAAGGCAGCTTACTGGATTGTAACTGACGCTGATAGCTCGG
+AAAGTGT
+>6697
+TACGTAGGTGGCGAGCGTTATCCGGAATCATTGGGCGTAAAGAGGGAGCAGGCGGCCGCAAGGGTCTGTGGTAGAAAGAC
+CGGAAGCCTAAACTTCGGTAAGCCATGGAAACCGGGCGGCTGGAGTGCGGAAGAGGATCGTGGAATTCCATGTGTAGCGG
+TGAAATGCGTAGATATATGGAGGAACACCAGTGGCGAAGGCGACGGTCTGGGCCGCAACTGACGCTCATTCCCGAAAGCG
+TGGGGAGCAAATAGGATTAGATACCCTAGTAGTCCACGCCGTAAACGATGGTCACTAAGTGTCGGGGTCAAACCCCGGTG
+CTGCAGTCAACGCAATAAGTGACCCGCCCTGGAGTAGTACGTTCGACTAAGAAAT
+>556231
+TACGGAGGATGCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGAGCGCAGACGGGAGATTAAGTCAGTTGTGAAAGTTT
+GCGGCTCAACCGTAAAATTGCAGTTGATACTGGTTTCCTTGAGTGCAGTTGAGGCAGGCGGAATTCGTGGTGTAGCGGTG
+AAATGCTTAGATATCACGAAGAACTCCGATTGCGAAGGCAGCTCACTAGACTGTCACTGACACTGATGCTCGAAAGTGTG
+GGTATCAAACAGGATTAGATACCCTGGTAGTCCACACAGTAAACGATGAATACTCGCTGTTTGCGATATACAGTAAGCGG
+CCAAGCGAAAGCATTAAGTATTCCACCTGGGGAGTACGCCGGCAACGGTGAAACTCAAAGGAATTGACGGGGCCCGCACA
+AGCGGAGGAACATGTGGTTTAATTCGATGATACGCGAGGAACCTTACCCGGGCTTAAATTGCAACAGAATATATTGGAAA
+CAGTATAGCCGTAAGG
+>546756
+TACGGAAGGTCCGGGCGTTATCCGGATTTATTGGGTTTAAAGGGAGCGTAGGCCGGAGATTAAGCGTGTTGTGAAATGTA
+GACGCTCAACGTCTGCACTGCAGCGCGAACTGGTTTCCTGAGTACGCAACAAAGTGGCGAATTCGTGGTGTAGCGGTGAA
+ATGCTTAGATATCAACG
+>555446
+TACGGAGGGTGCAAGCGTTAATCGGAATAACTGGGCGTAAAGGGCATGCAGGCGGTTCATCAAGTAGGATGTGAAATCCC
+CGGGCTCAACCTGGGAACAGCATACTAAACTGGTGGACTAGAGTATTGCAGGGGGAGACGGAATTCCAGGTGTAGCGGTG
+GAATGCGTAGATATCTGGAAGAACACCAAAGGCGAAGGCAGTCTCCTGGGCAAATACTGACGCTCATATGCGAAAGCGTG
+GGTAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGTCAATTAGGAGCTTGGGCGATAGTCTGGGTTC
+CGCAGCTAACGCAATAAATTGACCGCCTGGGGAGTACGGCCGCAAGGTTAAAACTCAAATGAATTGACGGGGGCCCGCAC
+AAGCGGTGGAGCATGTGGTTTAATTCGAAGCAACGCGAAAACCTTACAGGCTTGACATCTGACGAATCTGGATGAAAGTT
+CGGAGTGCTCTTCGGAG
+>274500
+ATACGGAGGATCCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGAGCGTAGATGGATGTTTAAGTCAGTTGTGAAAGTT
+TGCGGCTCAACCGTAAATTGCAGTTGAATACTTGGATATCTTGAGTGCAGTTGAGGCAGGCGGAATTCGTGGTGTAGCGG
+TGAAATGCGTTAGGATA
+>236550
+TACGTATGGTGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGCAGGCGGTGCGGCAAGTCTGATGTGAAAGCCC
+GGGGCTCAACCCCGGTACTGCATTGGAAACTGTCGTACTAGAGTGTCGGAGGGGTAAGTGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATCACGAAGAACTCCGATTGCGAAGGCAGCTTACCGGAGCGCAACTGACGCTGAGGCTCGAAAGCGCG
+GTATCGAACAGAATTAGATACCCTGGTAGTCCGCGCGGTAAACGATGGATGCCCGCCGTTGGGATTTGGATTTCAGCGGC
+CAAGCGAAAGCGTTAAGCATCCCACCTGGGGAGTACGCCGGCAACGGTGAAACTCAAAGGAATTGACGGGGGCCCGCACA
+AGCGGAGGACATGTGGTTTAATCGATGATACGCGAGGAACCTTACCCGGGCTTGAATTGCAGGAGAACGATCCAGAGA
+>88301
+TACGTAGGATGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGCAGGCGGGACTGCAAGTTGGATGTGAAATACC
+GTGGCTTAACCACGGAACTGCATCCAAAACTGTAGTTCTTGAGTGAAGTAGAGGCAAGCGGAATTCCGAGTGTAGCGGTG
+AAATGCGTAGATATTCGGAGGAACACCAGTGGCGAAGGCGGCTTGCTGGG
+>102382
+TACGTAGGTGGCAAGCGTTGTCCGGAATTACTGGGTGTAAAGGGTGCGCAGGCGGGATTGCAAGTTGGATGTGAAATACC
+GGGGCTTAACCCCGGAGCTGCATCCAAAACTGTAGTTCTTGAGTGGAGTAGAGGTAAGCGGAATTCCGAGTGTAGCGGTG
+AAATGCGTAGATATTAGAAGGAACACCAGTGGCGAAGGCGACTTGCTGGGCCACCACTGACGGTCAGGGACGAAAGCGTG
+GGGAGCAAATAGGATTAGATACCCTAGTAGTCCACGCCGTAAACGATGAGTACTAAGTGTCGGTTTTCCGGTGCTGAAGT
+TAACACATTAAGTACTCCGCCTGAGTAGTACGGTCGCAAGGCTGAAACTCAAAGGAATTGACGGGCACCCGCACAAGCGG
+TGGAGCATGCTGTTTAATTCGAAAATACGACGAAGAACCTTACCTAGACTTGACATCCCTGGCAAAGCTAATAGAAATAT
+>322045
+TACGTAGGTGGCGAGCGTTATCCGGATTTATTGGGCGTAAAGCGTCCGCAGCCGGTTTATTAAGTCTAGAATTAAAGCCT
+GGAGCTCAACTCCAGTTCGTTTTAGAAACTGATAGACTCGAGTGTGGTAGAGGCAAACGGAATTTCTAGTGTAGCGGTAG
+AATGCGTAGATATTAGAAGGAACACCAGTGGCGAAGGCGGTTTGCTAGGCCACCACTGACGGTCATGGACGAAAGCGTGG
+GGAGCAAATAGGATTAGATACCCTAGTAGTCCACGCTGTAAACGATGAGTACTAAGTGTCGGGTTACCGGTGCTGAAGTT
+AACACATTAAGTACTCCGCCTGAGTAGTACGGTCGCAAGGCTGAACTCAAAGGAATTGACGGGGACCCGCACAAGCGGTG
+GAGCATGTGGTTTTAATTCGAAGCAACGCGAAGAACCTTACCAAGTCTTGACATCCCGATGACAGGGTATGTAATGTACT
+TTTCCCTTCGGGCATCGGTGACAGGTGGTGCATGGTTGTCGTCAG
+>133568
+TACGTAGGGGGCGAGCGTTATCCGGATTTATTGGGCGTAAAGCGTGTGTAGGCGGTTTGTTAAGTTTAAGATTAAAGCCC
+GGGCTCAACTCCGGTAAGTCTTAAAAACTGGCAGACTTGAGTACGGTAGAGGCAAACGGAATTTCTAGTGTAGCGGTGAA
+ATGCGTAGATATTAGAAGGAACACCAGTGGCGAAAGCGGTTTGCTGGGCCGTTACTGACGCTGAGGCACGAAAGCGTGGG
+GAGCAAATAGGATTAGATACCCTAGTAGTCCACGCCGTAAACGATGAGTACTAAGTGTTGGGTTACCCAGTGCTGAAGCT
+AACGTATTAAGTACTCCGCCTGAGTAGTACGGTCGCAAGGCTGAAACTCAAAGGAATTGACGGCACCCGCACAAGCGGTG
+GAGCATGCTGTTAATTCGAAGATACGCGAAGAACCTTACCTAGACTTGACATCCCCGGCAAAGCTATAAGAAATAAGTAG
+TAGGAAGGTTACCCGGGTGACGAGGTGG
+>97627
+AACGTAGGTCACAAGCGTTGTCCGGAATTACTGGGTGTAAAGGGAGCGCAGGCGGGAAGACAAGTTGGAAGTGAAATCCA
+TGGGCTCAACCCATGAACTGCCTTCAAAACTGTTTTTCTTGAGTAGTGCAGAGGTAGGCGGAATTCCCGGTGTAGCGGTG
+GAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCCAACTGGGCTTTTACTGACGCTGAGGCTCGAAAGTGTG
+GGGAGCAAACAGG
+>578828
+TACGTAGGGAGCGAGCGTTGTCCGGATTTACTGGGTGTAAAGGGTGCGTAGGCGGGCTGTCAAGTCAGATGTGAAATACC
+GGGGCTCAACTCCGGGGCTGCATTTGAAACTGATGGTCTTGAGTGAAGTAGAGGCAGGCGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCCTGCTGGGCTTTAACTGACGCTGAGGCACGAAAGCATG
+GGGAGCAAACAGGATTAGATACCCTGGTAGTCCACACTGTAAACGATGAATGCTAGGTGTAGGGGGTATCGACCCCTTCT
+GTGCCGCAGTCAACACAATAAGCATTCCGCCTGGGGAGTACGGCCGCAAGGTTGAACTCAAGGAATTGACGGGGCCGCAC
+AAGCAGCGGAGCATGTGGTTTAATTCGACGCAACGCGAAGAACCTTACCAGGTCTTGACATCCACTAAACTTACAGAGAT
+GTAAGGTGTG
+>239064
+TACGTATGGTGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGTAGACGGAGTGGCAAGTCTGATGTGAAAACCC
+GGGGCTCAACCCCGGGACTGCATTGGAAACTGTCAATCTAGAGTACCGGAGAGGTAAGCGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCTTACTGGACGGTAACTGACGTTGAGGCTCGAAAGCGGT
+GGGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGACTACTAGGTGTCGGGCAGCAAAGCTGTTCG
+GTGCCGCAGCAAAACGCAAATAAGTAGTCCACCTGGGGGAGTACGTTCGCAAGAATGAAACTCAAAAGGAATTGAACGGG
+ACCCGCACAAGCGGTGAGCATGTAGGTTTAATT
+>161340
+TACGTAGGGGGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGTAGACGGACTGGCAAGTCTGATGTGAAAGGCG
+GGGGCTCAACCCCTGGACTGCATTGGAAACTGTTAGTCTTGAGTGCCGGAGAGGTAAGCGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCTTACTGGACGGTAACTGACGTTGAGGCTCGAAAGCGTG
+GGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCTGTAAACGATGATTACTAGGTGTGGGGGGACTGACCCCTTCCG
+TGCCGCAGTTAACACAATAAGTAATCCACCTGGGGAGTACGGCCGCAAGGTTGAAACTCAAAGGAATTGACGGGGCCCGC
+ACAAGCAGTGGAGTATGTGGTTTTAATTCGAAGCAACGCGAAGAACCTTACCAGGTCTTGACATCGAGTGCATAACTAAG
+AGATTAGTGAAATCCCTTCGGGGACACTTAGACAGGTGTGCA
+>235390
+TACGTAGGGGGCAAGCGTTATCCGGAATTATTGGGCGTAAAGAGTGCGTAGGTGGTTACCTAAGCGCAGGGTATAAGGCA
+ATGGCTCAACCATTGTTGGCCCTGCGAACTGGGCTACTTGAGTGCAGGAGAGGAAAGCGGAATTCCTAGTGTAGCGGTGA
+AATGCGTAGATATTAGGAGGAGCACCAGTGGCGAAGGCGGCTTTCTGGACTGTAACTGACACTGAGGCACGAAAGCGTGG
+GAGCAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGAGCACTAGGTGTCGGGGCCGCAAGGCTTCGGTGCC
+GCAGTCAACGCATTAAGTGCTCCGCCGTGGGAGTACGCACGCAAGTGTGAAACTCAAGGAATTGACGGGGACCGCACAAG
+CAGCGGAGCATGTGGTTTAATTCGAAGCAACGCGAAGAACCTTACCAGGACTTGACATCCCTCTGACAGGACCTTAACCG
+GTTCCTTCGTACGGACAGAGGAGAC
+>141836
+TACGGAGGATGCAAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGTAGGCGGCACGCCAAGTCAGCGGTGAAATTTC
+CGGGCTCAACCCGGAGTGTGCCGTTGAAACTGGCGAGCTAGAGTACACAAGAGGCAGGCGGAATGCGTGGTGTAGCGGTG
+AAATGCATAGATATCACGCAGAACCCCGATTGCGAAGGCAGCCTGCTAGGGTGAAACAGACGCTGAGGCACGAAAGCGTG
+GGTATCGAACAGGATTAGATACCACGTGGTACGTACCGACGACGTAAACGGTAAACGTAGTAAGTAAACTTAAACCTTAA
+CGTTTGGTTTCGGACGTAACTAAACTAAGTTAAGGTAACGGGTACCGGTAACGACGGAAACGGAAACGGTTAAC
+>339015
+AACGTAGGTCACAAGCGTTGTCCGGAATTACTGGGTGTAAAGGGAGCGCAGGCGGGAAGACAAGTTGGAAGTGAAATCTA
+TGGGCTCAACCCATAAACTGCTTTCAAAACTGTTTTTCTTGAGTAGTGCAGAGGTAGGCGGAATTCCCGGTGTAGCGGTG
+GAATGCGTAGATATCGGGAGGAACACCAGTGGCGAAGGCGGCCTACTGGGCACCAACTGACGCTGAGGCTCGAAAGTGTG
+GGTAGCAAACAGGATTAGATACCCTGGTAGTCCACGCTGTAAACGATGGATACTAGGTGTGCGGGACTCGA
+>143699
+TACGGAGGATGCAAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGTAGGCGGCACGCCAAGTCAGCGGTGAAATTTT
+CGGGCTCAACCCGGACTGTGCCGTTGAAACTGGCGAGCTAGAGTGCACAAGAGGCAGGCGGAATGCGTGGTGTAGCGGTG
+AAATGCATAGATATCACGCAGAACCCCGATTGCGAAGGCAGCCTGCTAGGGTGCGACAGACGCTGAGGCACGAAAGCGTG
+GGTATCGAACAGGATTAGATACCCTGGTAGTCCACGCAGTAAACGATGAATACTAACTGTTTGCGATACAATGTAAGCGG
+TACAGCGAAAGCGTTAAGTATTCCACCTGGGGATACGCCGGACAACGGTGAAACTCAAAGGAATTAGACGGGGGCCCGCA
+CAAGCGGAGGAACATGTGGTTAATTCGATGATACGCGAGGAACCTTACCGGGCTCAAACGCAGGGGGAATATATATGAAA
+GTATATAGCTAGCAATAGTC
+>114783
+TACGTATGGTGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGCAGGCGGTGCGGCAGGTCTGATGTGAAAGCCC
+GGGGCTCAACCCCGGTACTGCATTGGAAACTGTCGTACTAGAGTGTCGGAGGGGTAAGCGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGATTGCTGGACGACAACTGACGGTGAGGCGCGAAAGCGTG
+GGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCTGTAAACGATGAATACTAGGTGTGCGGGGACTGACCCCCTGCG
+TGCCGCAGCTAACGCAATAAGTATTCCACCTGGGGAGTACGATCGCAAGGTTGAAACTCAAAGGAATTAGACGGGGGCCC
+GCGACAAGCGGTGGATTATGTGGTTTAATTCGAAGCAACGCGAAGAACCTTACCAGGGCTTGACATCCTACTAACGAAGT
+AGAGATACATC
+>164557
+TACGTATGGTGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGTGCGTAGGTGGCAAGGCAAGTCTGAAGTGAAAAATC
+CGGGGCTCAACCCCGGAACTGCTTTGGAAACTGTTTAGCTGGAGTACAGGAGAGGTAAGTGGAATTCCTAGTGTAGCGGT
+GAAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGACTTACTGGACTGCTACTGACACTGAGGCACGAAAGCGT
+GGGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGACTGCTAGGTGTCGGGTGGCAAAGGCCATTC
+GGTGCCGCAGCTAACGCAATAAGC
+>140805
+TACGTAGGGGGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGTAGACGGTTTTGCAAGTCTGAAGTGAAAGCCC
+GGGGCTTAACCCCGGGACTGCTTTGGAAACTGTAGAACTAGAGTGCAGGAGAGGTAAGTGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCTTACTGGACTGTAACTGACGTTGAGGCTCGAAAGCGTG
+GGGAGCAAACAGGGTATTAGACTACCCGTGGTACGT
+>196433
+TACGTAGGGAGCGAGCGTTGTCCGGAATTACTGGGTGTAAAGGGAGCGTAGGCGGGATCTTAAGTCAGGTGTGAAAACTA
+TGGGCTCAACCCATAGACTGCACTTGAAACTGAGGTTCTTGAGTGAAGTAGAGGCAGGCGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAGGAACATCAGTGGCGAAGGCGGCCTGCTGGGCTTTTACTGACGCTGAGGCTCGAAAGCGTG
+GGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCTGTAAACGATGGATACTAGGTGTGGGGGGTCTGACCCCCTCCGT
+GCCGCAGTTAACACAATAAGTATCCCACCTGGGGAGTACGATCGCAAGGTTGAAACTCAAAGGGAATTAGACGGGGGGCC
+CGCACGAAGCGGTGGAGTAATGT
+>9510
+TACGTAGGGGGCGAGCGTTGTCCGGAATTACTGGGCGTAAAGGGTGCGTAGGCGGTTGCTTAAGTTGGATGTGAAATACC
+CGGGCTTAACTTGGGGGGTGCATTCAAAACTGGGCGACTAGAGTTCAGGAGAGGGAAGCGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCTTTCTGGACTGACACTGACGCTGAGGCACGAAAGCGTG
+GGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCTGTAAACGATGAATACTAGGTGTAGGGGGTATGAACTCCCTCT
+GTGCCGTAGCAAACGCAATAAGTATTCCGCCTGGGGAGTACGGCCGCAAGGTTGAAACTCAAAGGAATTGACGGGGGCCC
+GCACAAGCAGCGGAGCATGTGGTTTAATTCGAAGCAACGCGAAGAACCTTACCAAGGCTTGACATCCTGCTAAAGTCATG
+GAAACATGATGTCCCTTCGGGGGAGCAGAGACAGGTGGTGC
+>55799
+TACGTAGGTGGCAAGCGTTGTCCGGATTTACTGGGTGTAAAGGGCGTGCAGCCGGGTCTGCAAGTCAGATGTGAAATCCA
+TGGGCTCAACCCATGAACTGCATTTGAAACTGTGGATCTTGAGTGTCGGAGGGGCAATCGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGATTGCTGGACGATAACTGACGGTGAGGCGCGAAAGTGTG
+GGGAGCAAACAGGATTAGATACCCTGGTAGTCCACACTGTAAACGATGAATACTAGGTGTGCGGGGACTGACCCCCTGCG
+TGCCGCAGTTAACACAATAAGTATTCCACCTGGGGAGCACGATCGCAAGGTTGAAACTCAAA
+>72374
+AATTGGACATTCCCGCCGAGAGCATCCGCGCGGATGTCGAACGGAAGCTGCGCAAAAGAAAGCGAATACAAAAAGAGCAG
+TCGCAGAAGATCGTCCGCCAAAGCGCCGGGTACGGCGACCGGGTCAACCCCGATTTCGCCAAGAATGTCGCCGCTGCAAA
+GGCAGAAGAGGCAGTCCTCGGCATGCTGCTCCTTTACCCCGAGCACCGCGAGGCGGTGCGGACGGGGAAGGT
+>224569
+TACGGAAGGTCCGGGCGTTATCCGGATTTATTGGGTTTAAAGGGAGCGTAGGCCGGAGATTAAGCGTGTTGTGAAATGTA
+GATGCTCAACATCTGCACTGCAGCGCGAACTTGGTTTCCTTGAGTACGCAACGAAAGGTGGGACGGTAATTCGTGGTGTA
+GCGGTAGAAATGCTTAGATA
+>10113
+TACGTAGGGGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGCAGGCGGTCTGGCAAGTCTGATGTGAAATCCCG
+GGGACTCAACTCCGGAATTGCATTGGAAACTGTCAGACTAGAGTGCCGGAGAGGTAAGTGGAATTCCCGTGTAGCGGTGG
+AATAGCGTAGATATCGGAGGAACACCACGTGGCGAAGGACGGGCCTACTGGGCACCAACTGACGCTGAGGCTCGAAAGTG
+TGGTAG
+>8389
+TACGTATGGTGCAAGCGTTATCCGGATTTACTGGGTGTAAAGAGAGCGCAGGCGGTGCGGCAAGTCTGATGTGAAAGCCC
+GGGGCTCAACCCGGTACTGCATTGGAAACTGTCGTACTAGAGTGTCGGAGGGGTAAGTGGAATTCCTAGTGTAGCGGTGA
+AATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCTTACTGGACGATAACTGACGCTGAGGCTCGAAAGCGTGG
+GGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGAATACTAGGTGTCGGGGAGCATTGCTCTTCGGT
+GCCGCAGCAAACGCAATAAGTATTCCACCTGGGGAGTACGTTCGCAAGAATGAAACTCAAAGGATTGACGGGGACCCGCA
+CAAGCGGTGGAGCATGTGGTTTAATTCGAAGCAACGGCGAAAGAACCTTACCAAGTCTTAGACACTCCCGAATGACAGAG
+TATGTAATGTACTTTCTC
+>362373
+TACGGAGGATCCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGAGCGTAGGTGGATTGTTAAGTCAGTTGTGAAAGTTT
+GCGGCTCAAACCGTAAAAATTGCAGTTGAAACTGGGAGTCTTGAGTACAGTAGAGGTGGGCGGAATTCGTGGTGTAGCGG
+TGAAAATGCTTAGATATCACGAAGAACTCCGATTGCGAAGGCAGCTCACTAGACTGTCACTGACACTGATGCTCGAAAAG
+TGTGGTATCAAAACAGGATTAGATACCCTGGTAGTCCACAC
+>139641
+TACGTAGGGAGCAAGCGTTATCCGGATTTATTGGGTGTAAAGGGTGCGTAGACGGGAAGGTAAGTTAGTTGTGAAATCCC
+TCGGCTCAACTGAGGAACTGCGACTAAAACTGCTTTTCTTGAGTGCTGGAGAGGAAAGTGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGACTTTCTGGACAGTAACTGACGTTGAGGCACGAAAGTGTG
+GGGAGCAAACAGGATTAGATACCCTGGTAGTCCACACAGTAAACGATGAATACTCGCTGTTTGCGATATACAGTAAGCGG
+CCAAGCGAAAGCGTTAAGTATTCCACCTGGGGAGTACGCCGGCAACGGTGAAACTCAAAGGAATTGGACGGGGGCCCGCA
+CAAGCGGAGGAACATGTGGTTTAATTCGATGA
+>125536
+TACGTAGGTGGCAAGCGTTGTCCGGATTTACTGGGTGTAAAGGGCGCGTAGGCGGGATGGCAAGTCAGATGTGAAATCCA
+TGGGCTCAACCCACGAACTGCATTTGAAACTGTCGTTCTTGAGTATCGGAGAGGCAAGCGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCTTGCTGGACGACAACTGACGCTGAGGCGCGAAAGCGTG
+GGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGAATACTAGGTGTCGGGGCCGTAGAGGCTTCGG
+TGCCGCAGCCAACGCAATAAGTATTCCACCTGGGGAGTACGTTCGCAAGAATGAAACTCAAAGGAATTGACGGGGACCCG
+CACAAGCGGTGGAGCATGTGGTTTAATTCGAAGCAACGCGAAGAACCTTACCTGGTCTTGACATCCTTCTGACCGGTCCT
+TAACCGGACCTTTTCCTTC
+>103157
+TACGGAGGATCCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGAGCGTAGATGGATGTTTAAGTCAGTTGTGAAAGTTT
+GCGGCTCAACCGTAAAATTGCAGTTGATTACTAGGATTATCTTGAGTGCAGTTGAGGCAGGCGGAATTCGTGGTGTAGCG
+GTGAAATGCTTAGGATATCA
+>245893
+TACGTATGGTGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGCAGGCGGTGCGGCAAGTCTGATGTGAAAGCCC
+GGGGCTCAACCCCGGTACTGCATTGGAAACTGTCGTACTAGAGTGTCGGAGGGGTAAGCGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCTTACTGGACGATAACTGACGCTGAGGCTCGAAAGCGTG
+GGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGAATACTAGGTGTTGGGAAGCATTGCTTCTCGG
+TGCCGTCGCAAACGCAGTAAGTATTCCACCTGGGGAGTACGTTCGCAAGAATGAAACTCAAAGAATTGACGGGGGCCCGC
+ACAAGCAGTGGAGTATGTGGTTTAATTCGACGCAACGCGAAGAACCTTACCAAGTCTTGACATCCCTTGACGATGCTGGA
+ACAGTATTTCTCTTTCGGAGCAAGGAGACAGG
+>144746
+TACGGAGGATCCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGAGCGTAGGCGGATTGTTAAGTCAGTTGTGAAAGTTT
+GCGGCTCAACCGTAAAATTGCAGTTGATACTGGCAGTCTTGAGTGCAGTAGAGGTGGGCGGAATTCGTGGTGTAGCGGTG
+AAATGCTTAGATATCACGAAGAACTCCGATTGCGAAGGCAGCACACTAATCCGTAACTGACGTTTCATGCTCGAAAGTGT
+GGGTATCAAACAGGATTAGATACCCTGGTAGTCCACACGGTAAACGATGGATACTCGCTGTTGGCGATATACTGTCAGCG
+GCTTAGCGAAAGCGTTAAGTATCCCACCTGGGGAGTACGCCGGCAACGGTGAAACTCAAAGGAATTGACGGGGGCCCGCA
+CAAGCGGAGGAACAGTAGTGGTTAATTCGATGA
+>278398
+TACGTAGGTGGCAAGCGTTGTCCGGATTTACTGGGTGTAAAGGGCGTGTAGGCGGGATGGCAGGTCAGATGTGAAATACC
+CGGGCTTAACCCGGGGGGTGCATTTGAAACCGCCGTTCTTGAGTGCCGGAGAGGGAAGCGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCTTTCTGGACGGTAACTGACGCTGAGGCGCGAAAGCGTG
+GGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGGATACTAGGTGTAGGAGGTATCGACCCCTTCT
+GTGCCGCAGTAAACACAATAAGTATCCCACCTGGGGAGTACCCCCGGCGCAAGGTTGAAACTCAAAGGATTGACGGGGCC
+CGCACAAGCAGTGGAGTATGTGGTTTAATTCGAAGCAACGCGAAGAACCTTACCAGGGCTTGACATCCCGCTGA
+>97508
+TTCTTGATACAGTCTTTGTCCCGTAATTTTTCCCGTAGTCCGTTTCCTTTTTCTATATATCCTTCATCGAAGATATTTCG
+TCCGAAATTCTTTATCTCACCGGAAACATTCCACCTCTTGTCATCGGCTATTCTTTCTTCTATGTATTCCAACAGCCAAT
+AGAGTACGGGAGATTGTCGGTCCAGCTTTTCTATCATGGAGTCCACGGCATCACTCAGAACTTCCATATTGTTCAGTTCG
+ATATTCAGATTGGCACCCAATTCCAGCTCCCGTGCCAAATTGCGCATCACCGACTGGAAGAATGAGTCAATGGTTTCCAC
+CCGGAAACGGCTGTAATCGTGTATCATGTAGTGCAGTGCCGTACCGGCGGCTGTCCCTATGTCTTCTTGGGGCATTTCCA
+GTTCTTCTGTGATTTTCT
+>248442
+TACGGAGGATCCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGAGCGTAGATGGGTGTTTAAGTCAGTTGTGAAAGTTT
+GCGGCTCAACCGTAAAATTGCAGTTGATACTGGATATCTTGAGTGCAGTTGAGGCAGGCGGAATTCGTGGTGTAGCGGTG
+AAATGCTTAGATATCAACGAAAGAACTCCGATTGCGAAGGCAGCCTGACTAAGC
+>269778
+ATACGGAGGATCCGAGCGTTATCCGGATTTATTGGGTTTAAAAGGGAGCGTAGGCGGATTGTTAAGTCAGTTGCGAAAAG
+TTTGCGGCTCAACCGTAAAATTGCAGTTGATAACTGGTCAGTCTTGAGTGCAGTAGAGGTGGGCGGAATTCGTGGTGTAG
+CGGTGAAATGCTTAGATATCACGAAGAACTCCGATTGCGAAGGCAGCCTCAC
+>312030
+AACGTAGGGTGCAAGCGTTGTCCGGAATTACTGGGTGTAAAGGGAGCGCAGGCGGGAAGACAAGTTGGAAGTGAAAAACC
+ATGGGCTCAACCCATGAATTGCTTTCAAAACTGTTTTTCTTGAGTAGTGCAGAGGTAGATGGAATTCCCGGTGTAGCGGT
+GGAATGCGTAGATATCGGGAGGAACACCAGTGGCGAAGGCGGTCTACTGGGCACCAACTGACGCTGAGGCT
+>573761
+AACGTAGGTCACAAGTGTTGTCCGGAATTACTGGGTGTAAAGGGAGCGCAGGCGGGAAGACAAGTTGGAAGTGAAATCCA
+TGGGCTCAACCCATGAACTGCTTTCAAAACTGTTTTTCTTGAGTAGTGCAGAGGTAGGCGGAATTCCCGGTGTAGCGGTG
+GAATGCGTAGATATCGGGAGGAACACCAGTGGCGAAGGCGGCCTACTGGGCACCAACTGACGCTGAGGCTCGAAAGTGTG
+GGTAGCAAACAGGATTAGATACCCTGGTAGTCCACACTGTAAACGATGATTACTAGTGTTGGAGGATTGACCCTTCAGTG
+CCGCAGTTAACACAATAAGTATTCCACCTGGGGAGTACGCCGGCAACGGTGAAAACTCAAAAGGAATTGACGGGGGCCCG
+CACAAGCGGAGGAACATGTGGTTTAATTCGATGATACG
+>583669
+TACGTAGGGGGCAAGCGTTGTCCGGAATAATTGGGCGTAAAGGGCGCGTAGGCGGCTCGGTAAGTCTGGAGTGAAAGTCC
+TGCTTTTAAGGTGGGAATTGCTTTGGATACTGTCGGGCTTGAGTGCAGGAGAGGTTAGTGGAATTCCCAGTGTAGCGGTG
+AAATGCGTAGAGATTAGGAGGAACACCAGTGGCGAAGGCGGCTTACTGGACTGTAACTGACGTTGAGGCTCGAAAGCGTG
+GGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGAATACTAGGTGTTGGGTTTCATAAGAAGCTCG
+GTGCCGGCGCAAACGCATTAAGTATTCCACCTGGGGAGTACGTTCGCAAGAATGAAACTCAAAGGATTGACGGGGACCCG
+CACAAGCGGTGGAGCATGTGGTTTAATTCGAAGCAACGCGAAGAACCTTACCAAGTCTTGACATCCCGTTGACCTGTTAT
+GTAATGTAACA
+>139752
+TACGTAGGGGGCAAGCGTTGTCCGGAATAATTGGGCGTAAAGGGCGCGTAGGCGGCTCGGTAAGTCTGGAGTGAAAGTCC
+TGCTTTTAAGGTGGGAATTGCTTTGGATACTGTCGGGCTTGAGTGCAGGAGAGGTTAGTGGAATTCCCAGTGTAGCGGTG
+AAATGCGTAGAGATTGGGAGGAACACCAGTGGCGAAGGCGACTAACTGGACTGTAACTGACGCTGAGGCGCGAAAGTGTG
+GGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGAATGCTAGGTGTAGGGGCATTAAGCTTCTGTG
+CCGCAGTTAACACAATAAGCATTCCACCTGGGGAGTACGGCCGCAAGGTTGAAACTCAAAGGAATTGACGGGAGCCCGCA
+CAAGCAGTGGAGTATGTGGTTTAATTCGACGCAACGCGAAGAACCTTACCAGGTCTTGACATACCTGAGAACCCGGACGT
+AAAGTGGCTGGGGTGCCCGTTCGGGGAATTTCGAGAGACGAGGTGGTGCAG
+>315848
+TACGTAGGTGGCAAGCGTTGTCCGGATTTACTGGGTGTAAAGGGCGTGTAGCCGGGAGGGCAAGTCAGATGTGAAATCCA
+CGGGCTCAACTCGTGAACTGCATTTGAAACTACTCTTCTTGAGTATCGGAGAGGCAATCGGAATTCCTAGCGTAGCGGTG
+AAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGATTGCTGGACGACAACTGACGGTGAGGCGCGAAAGCGTG
+GGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCTGTAAACGATGGATACTAGATGCAGGAGGTATTCACTCCTTCT
+GTGTCGCAGCTAACGCAATAAGTATCCCGCCTGGGGAGTACGGCCGCAAGGTTGAAACTCAAAGGAATTGACGGGGCCGC
+ACAAGCAGCGGAGCATGTGGTTTATTCGACGCAACGCGAAAAACCTTACCAAGACTTGACATCGTAT
+>516971
+TACGGAGGATCCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGAGCGTAGATGGATGTTTAAGTCAGTTGTGAAAGTTT
+GCGGCTCAACCGTAAAATTGCAGTTGATACTGGATATCTTGAGTGCAGTTGAGGCAGGCGGAATTCGTGGTGTAGCGGTG
+AAATGCTTAGATATCACGAAGAACTCCGGTTGCGAAGGCAGGCCTAGCTAAGCTGC
+>267568
+TACGGAAGGTCCGGGCGTTATCCGGATTTATTGGGTTTAAAGGGAGCGTAGGCCGGAGATTAAGCGTGTTGTGAAATGTA
+GATGCTCAACATCTGAACTGCAGCGCGAACTGGTTTCCTTGAGTACGCACAAAGTGGGCGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCTTACTGGACGATAACTGACGCTGAGGCTCGAAAGCGTG
+GGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGAATACTAGGTGTCGGGAGCATTGCTCTTCGGTG
+CCGCAGCAAACGCAATAAGTATTCCACCTGGGGAGTACGTTCGCAAGAATGAAACTCAAAGGAATTGACGGGGACCGCAC
+AAGCGGTGGAGCATGTGGTTAATTCGAAGCAACGCGAAGAACCTTACAAGTCTTGACATCCCGATGACAAGCTATGTAA
+>536009
+TACGTAGGTGGCGAGCGTTGTCCGGAATTACTGGGTGTAAAGGGTGCGTAGGCGGCTTCTAAAGTCAGATGTGAAATACC
+GCAGCTCAACTGCGGGGCTGCATTTGAAACTTGGGAGCTTGAGTGAAGTAGAGGTAAGCGGAATTCCTAGTGTAGCGGTG
+GAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGATTGCTGGACGATAACTGACGGTGAGGCGCGAAAGTGTG
+GGAGCAAACAGGATTAGATACCCTGGTAGTCCACACTGTAAACGATGAATACTAGGTGTGCGGGGACTGACCCCCTGCGT
+GCCGCAGTAAACACAATAAGTATTCCACCTGGGGAGTACGATCGCAAGGTTGAAACTCAAAGGAATTGACGGGGGCCCGC
+ACAAGCGGTGGATTATGTGGTTTAATTCGAAGCAACGCGAAGAACCTTACCAGGGTTTGACATCCTGCTAACGAAGTAGA
+GATACATTAGGTGCCCTTCGGGGAAAGCAGAGAC
+>275935
+TACGTAGGGGGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGTGTAGGTGGCTGTGCAAGTCAGGAGTGAAAGCCC
+GGGGCTCAACCCCGGGACTGCTCTTGAAACTGTGCGGCTTGAGTGCAGGAGGGGCAGGCGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAGGAACACCGGTGGCGAAGGCGGCCTGCTGGACTGTAACTGACACTGAGGCTCGAAAGCGTG
+GGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGAATACTAGGTGTCGGGGCTCATAAGAGCTTCG
+GTGCCGCAGCAACGCAATAAGTATTCCACCTGGGGAGTTACGTTCGCAAGAAT
+>558838
+TACGTAGGGAGCAAGCGTTGTCCGGAATTACTGGGCGTAAAGGGTGCGTAGGCGGCCGTTTAAGTCAGATGTGAAATACC
+CGTGCTTAACATGGGGGCTGCATTTGAAACTGGATGGCTTGAGTGCAGGAGAGGCAAGTGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGACTTGCTGGACTGTAATTGACGCTGAGGC
+>181819
+TACGTAGGGAGCAAGCGTTATCCGGATTTATTGGGTGTAAAGGGTGCGTAGACGGTAATGCAAGTTAGTTGTGAAATCCC
+TCGGCTTAACTGAGGAACTGCAACTAAAACTACATTTCTTGAGTATTGGAGGGGAAAGTGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAGGAACACCGGTGGCGAAGGCGACTTTCTGGGCAATTACTGACGTTGAGGCACGAAAGTGTG
+GGGAGCAAACAGGATTAGATACCCTGGTAGTCCACACCGTAAACGATGGATACTAGGTGTAGGATGTGATAAACATTCTG
+TGCCGTCGCAAACGCAAATAAGTACTCCCACCGTGGGGGAGTACCGGGCCGCAAGGTTGAAACTCAAAAGGAATTGGACC
+GGGGGCCCGCACAAGCAGTGGAGTATGTGGTTTAATTCGACGCAACGCGAAGAACCTTACCAGGGCTTGACATATAGAGG
+AAATAAGCTAGAGA
+>141423
+TACGTAGGGAGCGAGCGTTATCCGGATTTATTGGGTGTAAAGGGTGCGTAGACGGGAAGTCAAGTTAGTTGTGAAATCCC
+TCGGCTTAACTGAGGAACTGCAACTAAAACTGATTTTCTTGAGTACTGGAGAGGAAAGTGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAGGAACACCGGTGGCGAAGGCGACTTTCTGGACAGAAACTGACGTTGAGGCACGAAAAGTGT
+GGGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGAATACTAGGTGTAGGGGTTGTCATGACCTCT
+GTGCCGCCGCTAACGCATTAAGTATTCCGCCTGGGGAGTACGGTCGCAAGAATTAAAACTCAAAGGAATTCGACGGGGGC
+CCGCACAAGCAGCGGAGCATTGTGGTTTAATTCGAAGCAACGCGAAGAACCTTACCTAGAC
+>8096
+TACGTAGGTGGCAAGCGTTGTCCGGATTTACTGGGTGTAAAGGGCGTGTAGCCGGGAGGGCAAGTCAGAGTGTGAAAATC
+CCACGGGCGTCAACTCCGTGAACTGCATTTGAAACTAACTCTTCTTCGAGTATCGGAGAGGCAATCGGAATTCCTAGTGT
+AGCGGTGAAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGATTGCTGGACGACAACTGACGGTGAGGCG
+>142784
+GTATAAGAGTTTCGTACTTTATCTCGGTATCTGAGATGTTCTCCTCGGGACGGAGTTCCTCGCTGCCCTTTGTGAACTGC
+CACGATCTCAGTTCAATATCAGCGCCGGAAAATACCAGATAGATATTGTGATTTCTGCCGTCAAACTGACTGAATTCAGC
+AGAACGAACCTTAGCATAATCAGCGTTGTCAAATTCAATGAAAGATACTGCCTCGCCGGAAATGCTGTCAAGGCGAACCT
+CGATTCTGCCCTTGCCTTTAACATCAGCAGTGAAAGCGGAAGCGCCATAACCGAAAT
+>343119
+TACATAGGGTGCAAGCGTTATCCGGAATTATTGGGTGTAAAGGGTGCGTAGACGGGAAAACAAGTTAGTTGTGAAATCCC
+TCGGCTCAACTGAGGAACTGCAACTAAAACTATTTTTCTTGAGTGTCGGAGAGGAAAGTGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGACTTTCTGGACGATAACTGACGTTGAGGCACGAAAGTGTG
+GGGAGCAAACAGGATTAGATACCCTGGTAGTCCACACCGTAAACGATGGATACTAGGTGTAGGGTGTATTAAGCATTCTG
+TGCCGCCGCTAACGCATTAAGTATCCCACCTGGGGAGTACGACCGCAAGGTTGAAACTCAAAGGAATTGACGGGGCCCGC
+ACAAGCAGTGGAGTATGTGGTTTAATTCGAAGCAACGCGAAGAACCTTACCAGGGCTTGACATATACCGGAAT
+>511371
+TACGTAGGTGGCAAGCGTTGTCCGGATTTACTGGGTGTAAAGGGCGTGTAGCCGGGAGGGCAAGTCAGATGTGAAATCCA
+CGGGCTCAACTCGTGAACTGCATTTGAAACTACTCTTCTTGAGTATCGGAGAGGCAATCGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAGGAACATCAGTGGCGAAGGCGGCTTACTGGGCTTTAACTGACGCTGAGGCTCGAAAGCGTG
+GGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGATTACTAGGTGTGGGGGGACTGACCCCTTCGT
+GCCGCAGCAAACGCAATAAGTAATCCACCGTGGGGAGTACGACCGAC
+>279470
+TACGTATGGTGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGCAGGACGGAAGGCTAAGTCGTGATGTGAAAGC
+CCGGGGCTCAACCCCGGTACTGCATTGGAAACTGGTCATCTAGAGTGTCGGAGGGGTAAGTGGAATTCCTAGTGTAGCGG
+TGAAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCTTACTGGGCGATAACTGACGCTGAGGCTCGAAAGCG
+TGGGAGCAAACAGGATTAGATACCCTGGTA
+>546723
+TACGTAGGTGGCAAGCGTTGTCCGGAATTACTGGGTGTAAAGGGAGCGTAGGCGGGATGCCAAGTCAGCTGTGAAAACTA
+TGGGCTTAACTTGTAGACTGCAGTTGAAACTGGTATTCTTGAGTGAAGTAGAGGTTGGCGGAATTCCGAGTGTAGCGGTG
+AAATGCGTAGATATTCGGAGGAACACCGGTGGCGAAGGCGGCCAACTGGGCTTTAACTGACGCTGAGGCTCGAAAGTGTG
+GGTAGCAAACAGGATTAGATACCCTGGTAGTCCACACCGTAAACGATGATTACTAGGTGTTGGAGGATTGACCCCTTCAG
+TGCCGCAGTTAACACAATAAGTAATCCACCTGGGGATACGACCGCAAGGTTGAAACTCAAAGGAATTGACGGGGGCCCGC
+ACAAGCAGTGGAGTATGTGGTTTAATTCGACGCAACGCGAAGAACCTTACCAAGTCTTGACATCCTGCGACGGACATAAG
+AAATTATGTCTTTCCTTCGGGACGCAGAGACAGGTGGTG
+>454435
+TACGTAGGGAGCGAGCGTTGTCCGGAATTACTGGGTGTAAAGGGAGCGTAGGCGGGATCGCAAGTCAGATGTGAAAACTA
+TGGGCTTAACCCATAAACTGCATTTGAAACTGTGGTTCTTGAGTGCAGTTGAGGCAGGCGGAATTCGTGGTGTAGCGGTG
+AAATGCTTAGATATCACGAAGAACCCCGATTGCGAAGGCAGCTTGCTCAACTGTAACTGACGTTCATGCTCGAAAGTGTG
+GGTATCAAACAGGATTAGATACCCTGGTAGTCCACACGGTAAACGATGGATACTCGCTGTTGGCGATATACTGTCAGCGG
+CCAAGCGAAAGCATTAAGTATCCCACCTGGGGAGTACGCCGGCAACCGGTGAAACTCAAAGGATTGACGGGGCCCGCACA
+AGCGGAGGAACATGTGG
+>257199
+TACGTAGGGGGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGTAGACGGCCGTGCAAGTCTGATGTGAAAGGCT
+GGGGCTCAACCCCGGGACTGCATTGGAAACTGTATGGCTGGAGTGCCGGAGAGGGTAAGCGGAATTCCTAGTGTAGCGGT
+GAAATGCGTAGATATTAGGAGGAACACCAGTAGCGAAGGCGGCTTTCTGGACTGTAACTGACACTGAGGCACGAAAGCGT
+GGGGAGCAAACAGGATTAAGATACCCTGGAGTCCACGCCGTAAACGATGAGTACTAGGTGTCGGGGGTTACCCCCCTCGG
+GTGCCGCAGCTAACGCATTAAGTACTCCG
+>238279
+TACGTATGGAGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGTGTAGGTGGCCATGCAAGTCAGAAGTGAAAATCC
+GGGGCTCAACCCCGGAACTGCTTTTGAAACTGTAAGGCTAGAGTGCAGGAGGGGTGAGTGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCTCACTGGACTGTAACTGACACTGAGGCACGAAAGCGTG
+GGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGAATACTAGGTGTTGGGTTCCAAAGGGGACTCG
+GTGCCGTCGCAAACGCATTAAGTATTCCACTGGGGGTACTTCGCAAGAATGAAACTC
+>362382
+TACGTATGGAGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGTAGACGGCACGGCAAGCCAGATGTGAAAGCCC
+GGGGCTCAACCCCGGGACTGCATTTGGAACTGCTGAGCTAGAGTGTCGGAGAGGCAAGTGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCTTGCTGGACGATGACTGACGTTGAGGCTCGAAAGCGTG
+GGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGACTGCTAGGTGTCGGGTAGCAAAGCTATTCGG
+TGCCGCAGCTAACGCAATAAGCAGTCCACCGTGGGGAGTACGTTCGCAAGAATGAAACTCAAAGGAATTGACGGGACCCG
+CACAAGCGGTGGAGCATGTGGTTTAATTCGAAGCAACGCGAAGAACCTTACCTGATTCTTGACATCCCGATCGACCGTCT
+T
+>305967
+TACGTAGGGAGCGAGCGTTGTCCGGAATTACTGGGTGTAAAGGGAGCGTAGGCGGGATCGCAAGTCAGATGTGAAAACTA
+TGGGCTTAACCCATAAACTGCATTTGAAACTGTGGTTCTTGAGTGAAGTAGAGGTAAGCGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAGGAAACATCAGGTGGACGGAA
+>349780
+TACGTAGGGGGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGTAGGCGGTCTGGCAAGCCAGAAGTGAAAGCCC
+GGGGCTTAACCCCGGGACTGCTTTTGGAATTGTTAGACTAGAGTGTCGGAGAGGTAAGTGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCTTACTGGACGATAACTGACGCTGAGGCTCGAAAGCGTG
+GGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGGATGCCGCCGTTGGGATTTGGATTTCAGCGGC
+CAAGCGAAAGCGTTAAGCATCCCACCTGGGGAGTACGCCGGCAACGGTGAAACTCAAAGGAATTGACGGGGCCCGCACAA
+GCGGAGGAACATGTGGTTTAATTCGATGATACGCGAGGAACCTTACCCGGGCTTGAATTGCAGGAGAACGATCCAGAGAT
+GGTGAGGCCCTTCGGGGCTCCGTGTGAAGGTGC
+>512616
+TACGTAGGTGGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGTGTGTAGGCGGGACGACAAGTCAGATGTGAAACTCA
+TGGGCTCAACCCATGACCTGCATTTGAAACTGCCGTTCTTGAGAGTCGGAGAGGTAAATGGAATTCCCGGTGTAGCGGTG
+AAATGCGTAGATATCGGGAGGAACACCAGTGGCGAAGGCGATTTACTGGACGACCACTGACGCTGAGACACGAAAGCGTG
+GGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGAATGCTAGGTGTAGGGGCAGTTAAGCTTCTGT
+GCCGCAGTTAACACAATAAGCATTCCACCTGGGAGTACGGCCGCAAGGTTGAAACTCAAAGGAATTGACGGGAGCCCGGC
+ACAGCAGTGGAGTATGTGGTTTAATTCGACGTCAACGCGAAGGAACCCTTACCGAGGTACTTGACA
+>593006
+TACGTAGGGGGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGTAGACGGCATCACAAGTCAGAAGTGAAAATCC
+GGGGCTCAACCCCGGAACTGCTTTTGAAACTGTGGAGCTGGAGTGCAGGAGAGGTAAGCGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCTTACTGGACGATGACTGACGCTGAGGCTCGAAAGCGTG
+GGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGAATACTAGGTGTCGGGGAGCACAGCTCTTCGG
+TGCCGCAGCAAACGCAGTAAGTATTCCACCTGGGGAGTACGTTCGCAAGAATGAAACTCAAAGGAATTGACGGGGACCCG
+CACAAGCGGTGGAGCATGTGGTTTAATTCGAAGCAACGCGAAGAACCTTACCAAGTCTTGACATCCTTCTGACAGAGTAT
+GTAATGTACTTTCCCTTCGGGGCAGAAGTGACAGG
+>292288
+AACGTAGGGTGCAAGCGTTGTCCGGAATTACTGGGTGTAAAGGGAGCGCAGGCGGACCGGCAAGTTGGATGTGAAATCCA
+CGGGCTCAACTCGTGAACTGCATTTGAAACTACTCTTCTTGAGTATCGGAGAGGCAATCGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGATTGCTGGACGACAACTGACGGTGAGGCGCGAAAGCGTG
+GGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCTGTAAACGATGAATACTAGGTGTGCGGGGACTGACCCCCTGCGT
+GCCGCAGCTAACGCAATAAGTATTCCACCTGGGGAGTACGATCGCAAGGTTGAAACTCAAAGGATTGACGGGGGCCCGCA
+CAAGCGGTGGATTATGTGGTTTAATTCGAAGCAACGCGAAGAACCTTACCAGGGCTTGACATCCTACTAACGAGATAGAG
+ATATGTT
+>166871
+TACGTAGGGGGCAAGCGTTGTCCGGAATAATTGGGCGTAAAGGGCGCGTAGGCGGCCCGGTAAGTCTGGAGTGAAAGTCC
+TGCTTTTAAGGTGGGAATTGCTTTGGATACTGTCGGGCTTGAGTGCAGGAGAGGTAAGCGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCTTACTGGACTGACAATGACGCTGAGGCTCGAAAGCGTG
+GGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGAATACTAGGTGTCGGGGGCCATAAGGCTTTCG
+GTGCCGCAGCAAACGCAATAAGTATTCCACCTGGGGAGTACGTTCGCAAGAATGAAACTCAAAGGAATTGACGGGACCCG
+CACAAGCGGTGGAGCATGTGGTTTAATTCGAAGCAACGCGAAGAACCTTACCAGGCCTTGACATCCCGGTGACCGTCCCG
+TAATGGGGACCTCTCTTCGGAGCACCGGTGACAGGTGGTGCATGGTTG
+>565944
+TACGTAGGGGGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGTAGACGGCGCAGCAAGTCTGATGTGAAAGGCA
+GGGGCTTAACCCCTGGACTGCATTGGAAACTGCTGTACCTGAGTGCCGGAGGGGTAAGCGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCTTACTGGACGGTAACTGACGTTGAGGCTCGAAAGCGTG
+GGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGAAATACTAGGGTGTCAGGGAGCGACTAGCTTC
+TTTGGTGCCGCCGC
+>510626
+TACGTAGGGGGCGAGCGTTATCCGGATTTATTGGGCGTAAAGCGTGCGTAGGCGGTTTATTAAGTCTGGAATTAAAGCCC
+GAGGCTTAACCTCGGTTCGTTCTAGATACTGGTTGACTAGAGTACAGTAGAGGCAAATGGAATTCCTAGTGTAGCGGTGG
+AATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGATTTGCTGGGCTGTAACTGACGCTGAGGTACGAAAGCGTGG
+GTAGCAAATAGGATTAGATACCCTAGTAGTCCACGCTGTAAACGATGAGCACTAAGTGTCGGGTTACCGGTGCTGAAGTT
+AACACATTAAGTGCTCCGCCCTGAGTAGTACGGTCGCAAGACTGAAACTCAAAGGAATTGACGGGCACCCGCACAAGCGG
+TGGAGCATGTTGTTAATTTCGAAAATACGCGAAGAACCTTACCTAGCTTGACATCCCCTGGCAAAAGCTAATAGAAAATA
+TAAGTGGGAGGTTATACCA
+>351629
+AACGTCGGTCACAAGCGTTGTCCGGAATTACTGGGTGTAAAGGGAGCGCAGGCGGGTGATCAAGTTGGAAGTGAAATCCA
+TGGGCTCAACCCATGAACTGCTTTCAAAACTGATCGTCTTGAGTAGTGCAGAGGTAGGCGGAATTCCCGGTGTAGCGGTG
+GAATGCGTAGATATCGGGAGGAACACCAGTGGCGAAGGCGGCCTACGTGGGCACCAACTGACGCTGAGGCTCGAAAGTGT
+GGGTAGACAAACAGGATTAGACTACCCTGGTACGTCCACAACCGTAAACGAT
+>202238
+TACGGAGGATCCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGAGCGTAGGTGGATTGTTAAGTCAGTTGTGAAAGTTT
+GCGGCTCAACCGTAAAATTGCAGTTGAAACTGGCAGTCTTGAGTACAGTAGAGGTGGGCGGAATTCGTGGTGTAGCGGTG
+AAATGCTTAGATATCACGAAGAACCTCGATTGCGAAGGCAGCTCACTAGACTTGTTACTGACACTGATGCTACGAAAGTG
+GTGGGTATACAAACGAGG
+>102209
+TACGTATGGTGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGTAGGTGGCAAGGCAAGCCAGAAGTGAAAACCC
+GGGGCTCAACCGCGGGATTGCTTTTGGAACTGTCATGCTAGAGTGCAGGAGGGGTGAGCGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAGGAACACCGGAGGCGAAGGCGGCTCACTGGACTGTAACTGACACTGAGGCTCGAAAGCGTG
+GGAGCAAACAGGATTAGATACCCTGGTAGTCCACATGGTAAACGATGGATACTCGCTGTTGGCGATATACTGTCAGCGGC
+CAAAGCGAAAGCATTAAGTATCCCACCTGGGGAGTACGCCGGCAACGGTGAAACTCAAAGGATTGACGGGGCCGCACAAG
+CGGAGGAACATGTGGTTTAATCGATGATACGCGAGGAACCTTACCGGGCTTAAATTGCAGACGAATTACGAGGAAACTTG
+TAAAGCCGCAAGGCGTCGTGTGTAAGGTG
+>93610
+TACGTAGGGAGCGAGCGTTGTCCGGAATTACTGGGTGTAAAGGGAGCGTAGGCGGGATCGCAAGTCAGATGTGAAAACTA
+GTGGGCCTTAACCATAGACTGCATTTGAAACTGTGGTTCTTGAGTGAAGTAGAGGTAAGCGGAATTCCGTAGTGTAGCGG
+TGAAATGCGTAGA
+>205713
+TACGTAGGTGGCGAGCGTTATCCGGATTTACTGGGTGTAAAGGGTGTGTAGGCGGGACTGCAAGTCAGATGTGAAAACTA
+TGGGCTCAACTCATAGCTTGCATTTGAAACTGTGGTTCTTGAGGGTCGGAGAGGTAACTGGAATTCCCGGTGTAGCGGTG
+AAATGCGTAGATATCGGGAGGAACACCAGTGGCGAAGGCGAGTTACTGGACGATACCTGACGCTGAGACACGAAAGTGTG
+GGGAG
+>519193
+TACGTAGGGGGCAAGCGTTGTCCGGAATAATTGGGCGTAAAGGGCGCGTAGGCGGCTCGGTAAGTCTGGAGTGAAAGTCC
+TGCTTTTAAGGTGGGAATTGCTTTGGATACTGTCGGGCTTGAGTGCAGGAGAGGTTAGTGGAATTCCCAGTGTAGCGGTG
+AAATGCGTAGAGATTGGGAGGAACACCAGTGGCGAAGGCGACTAACTGGACTGTAACTGACGCTGAGGCGCGAAAGTGTG
+GGGAGCAAACAGGATTAGATACCCTGGTAGTCCACACTGTAAACGATGAATGCTAGGTGTAGGGGGTATCGACCCTTCTG
+TGCCGCAGTTAACACAATAAGCATTCCGCCTGGGGAGTACGGCCGCAAGGTTGAAACTCAAAGGAATTGACGGGGGCCCG
+CACAAGCAGTGGAGTATGTGGTTTAATTCGACGCAACGCGAAGAACCTTACCAAGTCTTGACATCCTGCGACGGTGCTAG
+AAATAGTATTTCCTTCGGGACGCAGAGACAGGTGGTGC
+>240591
+TACGTATGGAGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGTGTAGGTGGCCAGGCAAGTCAGAAGTGAAAGCCC
+GGGGCTCAACCCCGGGACTGCTTTTGAAACTGCAGGGCTAGAGTGCAGGAGGGGCAAGTGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCTTGCTGGACTGTAACTGACACTGAGGCTCGAAAGCGTG
+GGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGAATACTAGGTGTCGGGGCACATAAGTGCTCCGG
+TGCCGCAGCAAACGCAATAAGTATTCCACCTGGGGAGTACGTTCGCAAGAATGAAACTCAAAGGAATTGACGGGGACCCG
+CACAAGCGGTGGAGCATGTGGTTTAATTCGAAGCAACGCGAAGAACCTTACCAAGTCTTGACATCCTCTTGACCGGTCAG
+TAATGTGACCTTTTTCTTCGGAACAAGAGTGACA
+>223711
+TACGTATGGTGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGTGCGTAGGTGGCAAGGCAAGTCTGAAGTGAAAAATC
+CGGGGCTCAACCCCGGAACTGCTTTGGAAACTGTTTAGCTGGAGTACAGGAGAGGTAAGTGGAATTCCTAGTGTAGCGGT
+GAAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGACTTACTGGACTGCTACTGGCACTGAGGCACGAAAAGCC
+GTAGGGGAGCGAAACAGGAATTTAGATACCCTGGTAGTCCACGCCGTAAACGATAGAATACTAGGTG
+>107461
+TACGTAGGTGGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGTGTGTAGGCGGGTCTGCAAGTCAGATGTGAAACTAT
+GGGCCCAACTCATAGCTTGCATTTGAAACTGTGGATCTTGAGTGTCGGAGAGGTAAATGGAATTCCCGGTGTAGAGGTGA
+AATTCGTAGATATCGGGAGGAACACCAGTGGCGAAGGCGGTTTACTGGACGATTACTGACGCTGAGACACGAAAGTGTGG
+GGAGCAAACAGGATTAGATACCCTGGTAGTCCACACTGTAAACGATGAATGCTAGGTGTAGGGGGCGATTGAGCTTCTGT
+GCCGCAGTTAACACAATAAGCATTCCACTGGGGAGTACGGCCGCAAGGTTGAAACTCAAAGGAATTGACGGGAGCCCGCA
+CAAGCAGTGGAGTATGTGGTTTAATTCGACGCAACGCGAAGAACCTTACCGGAACTT
+>98457
+TAACGTATGGTGCAAGCGTTATCCGGATTTACTGGGTAGTAAAGGGTGCGTAGGTGGCAAGGCAAGTCTGAAGTGAAAAT
+CCGGGGCTCAACCCCGGAACTGCTTTGGAAACTGTTTAAGCTTAGAGTACAGGAGAGGTAAGTGGAATTCCTAGTGTAGC
+GGTGAAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGACTTACTGGACTGGTACTGACACTGAGGCACGAAAG
+CGTGGGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGAATACTAGGTGTTGGGGCCCAAAGGGCT
+TCGGTCGCCGCCGCAAACCGCA
+>108870
+TACGTATGGTGCAAGCGTTATCCGGTATTTACTTGGGGTGTAAAGGGGAGCGCAGGCGGTTGCGGCAAGTCTGATGTGAA
+AGCCCGGGGACTCAACCCCGGTACTGCATTGGAAACTGTCGTACTAGAGTGTCGGAGGGGTAAGCGGAATTCCTAGTGTA
+GCGGTGAAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCTTACGTGGACGATAACTGACGCTGA
+>114978
+GACTGATTCAGGAAGCCGCCTGCGGCAGAAAATTCTGCAGCGCGTCATGAACTACGCACAGAACAACTGTTCCGACGCTC
+TGCATGGCAAACTGAAAATTTATCCTGACGCTTTCACGGGTATTGCACTCGATATCCTGGCAAAGCAGGATTACCGCAAT
+ATTCTCACCGACCCGATTCATGCCGGCATGGTCAGCATTCTCAATACGGACAAACAGAGCATTCTCGACAGTCTGCTTCG
+TTCCAATATTCCACTGAATACACCCATGTGGATTACCGTCCGAGCCGGTAAGCAGGCTAACAGTGTCATGTCCGGTATGA
+CACTGGAGACATGGGAATGCGGCCTCAAGTTCAAGCCCAAT
+>150577
+TACGTAGGGGGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGTAGACGGTGAAGCAAGTCTGAAGTGAAAGGTT
+GGGGCTCAACCCCGAAACTGCTTTGGAAACTGTTTAACTGGAGTACAGGAGAGGTAAGTGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGTAGGGAACGAACCACGTGGGCGGAAGGACGGGCGTTA
+>545503
+TACGGAGGATCCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGAGCGTAGATGGATGTTTAAGTCAGTTGTGAAAGTTT
+GCGGCTCAACCGTAAAATTGCAGTTGATACTGGATATCTTGAGTGCAGTTGAGGCAGGCGGAATTCGTGGTGTAGCGGTG
+AAATGCTTAGATATCACGAAGAACTCCGATTCGCGAAGGCAGACCTAGCTAAGCTGC
+>100870
+TACGTAGGGAGCAAGCGTTGTCCGGATTTACTGGGGTGTAAAGGGTGCGTAGGGCGGCTTTTGCAAGTCAGATGTGAAAT
+CTATAGGGCTCAACCCATAAACTGCATTTGAAACTGTAGAGCTTGAGTGAAGTAGAGGCAGGCGGAATTCCCCGTGTAGC
+GGTGAAATGCGTAGAGATGGGGA
+>243335
+TACGTAGGGGGGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGTAGACGGTTTTGCAAGTCTGAAGTGAAAGCC
+CGGGGCTTAACCCCGGGACTGCTTTGGAAACTGTAGGACTAGAGTGCAGGAGAGGTAAGTGGAATTCCTAGTGTAGCGGT
+GAAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCTTACTGGACTGTAACTGACGTTGAGGCTCGAAAGCGT
+GGGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGATTACTAGGTGTTGGTGGGTATGACCTATCG
+GTGCCGCAGCAAACGCAATAAGTAATCCACCTGGGGAGTACGTTCGCAAGAATAGAAAACTCAAAAGGAATTGGACCGGG
+GACCGCACAAGCGGTGGAGCATGTTGGTTTAATTCGAAGCAACGCGAAGAACCTTACCTGGTCTTGACA
+>93537
+TACGGAGGGTGCGAGCGTTGTCCGGAATCATTGGGCGTAAAGAGTTCGTAGGTGGCCTGTTAAGTCTGGTGTTAAATGCA
+GATGCTCAACATCTGTTCGGCACTGGATACTGGCAAGCTTGAATGCGGTAGAGGTAAAGGGAATTCCTGGTGTAGCGGTG
+AAATGCGTAGATATCAGGAGGAACATCGGTGGCGTAAGCGCTTTACTGGGCCGTAATTGACACTGAGGAACGAAAGCCAG
+GGTAGCAAATGGGATTAGATACCCCAGTAGTCCTGGCCGTAAACGATGGATACTAGGTGTTGCGGGTATCGACCCCTGCA
+GTGCCGAAGCAAACGCGACTAAGTATCCGCCCTGGGAGGTACGCACGACAGTAGTGAAACTACAAAGG
+>265749
+TACGTAGGTGGCGAGCGTTTATCCGGTAATCGAATTGGGCGTAAAGAGGGAGCAGGCGGCCGCAAGGGTCTGTGGTGAAA
+GACCGAAAGCTAAACTTCGGTAAGCCATGGAAACCGGGCGGCTAGAGTGCGGAAGAGGATCGTGGAATTCCATGTGTAGC
+GGTGAAATGCGTAGATATATGGAGGAACACCAGTGGCGAAGGCGACGGTCTGGGCCGCAACTGACGCTCATTCCCGAAGC
+GTGGGGAGCAAATAGG
+>534609
+TACGTAGGGGGCGAGCGTTGTCCGGAATGATTGGGCGTAAAGGGCGCGTAGGCGGCCTGCTAAGTCTGGAGTGAAAGTCC
+TGCTTTCAAGGTGGGAATTGCTTTGGATACTGGTGGGCTGGAGTGCAGGAGAGGAAAGCGGAATTACCGGTGTAGCGGTG
+GAATGCGTAGAGATCGGTAGGAACACCAGTGGCGAAGGCGGCTTTCTGGACTGAAACTGACGCTGAGGCGCGAAAGCGTG
+GGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCCTAAACGATGTCTACTGGTTGTTGGGGTTTTTTAACCTTTGTA
+ACGAAGCTAACGCGTGAAGTAGACCGCCTGGGGAGTACGGTCGCAAGATTAAAACTCAAAGGAATTGACGGGGACCCGCA
+CAAGCGGTGGATGATGTGGATTAATTCGATGCAACGCGAAAAACCTTACCTAGCCTTGACATGCCAGGAATCCTGAAGAG
+ATTCGGGAGTGCCCGCAAGGGAATCTGGACACAGGTGC
+>263307
+GTACGGAGGATCCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGAGCGTAGGCGGACGGCTTAAGTCAGTTGTGAAAGT
+TTGCGGCTCAACCGTAAAATTGCAGTTGATACTGGGTGTCTTGAGTACAGTAGAGGCAGGCGGAATTCGTGGTGTAGCGG
+TGAGATGCTTAGATATCACGAAGAACTCCGATTGCGAAGGCAGCCTGCTGGACTGTAACTGACGCTGATGCTCGAAAGTG
+TGGGTATCAAACAGGATTAGATACCCTGGTAGTCCACACAGTAAACGATGAATACTCGCTGTTTGCGATATACAGTAAGC
+GGCCAAGCGAAAAGCGTTAAGTATTCCACCTGGGGAGTACGCCGGCAACGGTGAAACTCAAAGGAATTGACGGGGGCCCG
+CACAAGCGGAGGAACATGTGGTTTAATTCGATGATACGCGAGGAACCTTACCCGGGCTTGAATTGCAACTGAATGATGTG
+AGACATGTCAGCCGCAAGGC
+>344513
+TACGGAAGGTCCGGGCGTTATCCGGATTTATTGGGTTTAAAGGGAGCGTAGGCCGGAGATTAAGCGTGTTGTGAAATGTA
+GATGCTCAACATCTGCACTGCAGCGCGAACTGGTTTCCTTGAGTACGCATAAAGTGGGCGGAATTCGTGGTGTAGCGGTG
+AAATGCTTAGATATCACGAAGAACTCCGATTGCGAAGGCAGCTCACTGGGGCGCAACTGACGCTGAAGCTCGAAAGCGCG
+GGTATACGACGAA
+>5552
+TACGTAGGGGGCGAGCGTTGTCCGGAATCACTGGGCGTAAAGGGCGCGTAGGCGGTATAATAAGTCAGTGGTGAAAACTT
+GGGGCTCAACCCCGAGCCTGCCACTGATACTGTTAGACTTGAGTATGGAAGAGGAGAATGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGATTCTCTGGGCCAAGACTGACGCTGAGGCGCGAAAGCGTG
+GGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGGATACTAGGTGTTAGGAGTTTCGATGCTTCTA
+GTGCCGGAGTAAACACAATAAGTACTCCCGCCTGGGGGAGTACGGTCGCAAGACTGAAACTCAAGGAATTGGACGGGGGC
+CCGCACAAGCGGTGGAGCATGTGGTTTAATTCGACGCAACGCGAACGAACCTTAACCAAGACTTGACACTTCCCTTTGAC
+TAGGATA
+>561842
+TACGTAGGGAGCGAGCGTTGTCCGGAATTACTGGGTGTAAAGGGAGCGTAGGCGGGATCGCAAGTCAGATGTGAAAACTA
+TGGGCTTAACCCATAAACTGCATTTGAAACTGTGGTTCTTGAGTGAAGTAGAGGTAAGTGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCTTACTGGACGACCACTGACGCTGAGGCTCGAAAGCGTG
+GGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGAATACTAGGTGTTGGCAGGTAAGACCTGTCGG
+TGCCGCAGCTAACGCAATAAGTATTCCACCTGGGGAGTACGTTCGCAAGAATGAAACTCAAAGGAATTGACGGGGACCCG
+CACAAGCGGTGGAGCATGTGGTTTAATCGAAGCAACGCGAAGAACCTTACCAAGTCTTGACATCCCGATGACAGAGTATG
+TAATGTACTTTCTCTTTCGGAGTCATCGGTGACA
+>91532
+TACGTAGGTGGCGAGCGTTATCCGGAATCATTGGGCGTAAAGAGGGAGCAGGCGGCCGCAAGGGTCTGTGGTGAAAGACC
+GAAGCTAAACTTCGGTAAGCCATGGAAACCGGGCGGCTAGAGTGCGGAAGAGGATCGTGGAATTCCATGTGTAGCGGTGA
+AATGCGTAGATATATGGAGGAACACCAGTGGCGAAGGCGACGGTCCCCGTGGGCCGCAACTGACGCTCACTTCCCGAAAG
+CGGTGGGGAGCAAATAGGATTAGATACCCTAGTAGTCCACGACCGTAAACGATGGTCACTAAGTGGTCGGGGGTACCAAA
+CCCCGGTGCTGCAGTCAACGCAATAAGTCGACCCGCCTG
+>340730
+TACGTATGGTGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGTGCGTAGGTGGCAAGGCAAGTCTGAAGTGAAAATCC
+GGGGCTCAACCCCGGAACTGCTTTGGAAACTGTTTAGCTGGAGTACAGGAGAGGTAAGTGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGACTTACTGGACTGCTACTGACACTGGGGCACGAAAGCGTG
+GGGAGCAAGCAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGAATACTAGGTGCTGGGTTCCAAAGGAACTCGG
+TGCCGTCGCAAACGCATTAAGTATTCCACCTGGGGAGTACGTTCGCAAGAATGAAACTCAAAGGAATTGACGGGGACCCG
+CACAAGCGGTGGAGCATGTGGTTTAATTCGACGCAACGCGAAGAACCTTACCAGGTCTTGACATCCAACTTAAACTTACA
+GAGATGTAAGGTGTGCTTGC
+>369734
+TACGTAGGGGGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGTAGACGGACTGGCAAGTCTGATGTGAAGGCGG
+GGGCTCAACCCCGGTACTGCATTGGAAACTGGTCATCTAGAGTGTCGGAGGGTAAGTGGAATTCCTAGTGTAGCGGTGAA
+ATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCTTACTGGACGATAACTGACGCTGAGGCTCGAAAGCGTGGG
+GAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGAA
+>210865
+TACGTAGGGGGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGTAGGTGGTACTGCAAGTCAGAAGTGAAAGCCC
+CGTGCTTAACGTGGGGACTGCTTTTGAAACTGTGGGACTAGAGTGCAGGAGAGGTAAGTGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAGGAACACCGGTGGCGAAGGCGGCTTACTGGACTGTAACTGACACTGAGGCTCGAAAGCGTG
+GGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCTGTAAACGATGAATACTAGGTGTTGGGGAGCGAAGCTCCTCGG
+TGCCGTCGCAAACGCAATAAGTATTCCACCTGGGGAGTACGTTCGCAAGAATGAAACTCAAAGGAATTGACGGGGACCCG
+CACAAGCGGTGGAGCATGTGGTTTAATTCGAAGCAACGCGAAGAACCTTACCAAGTCTTGACATCTGGAT
+>588604
+TACGTATGGAGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGTGTAGGTGGCCATGCAAGTCAGAAGTGAAAATCC
+GGGGCTCAACCCCGGAACTGCTTTTGAAACTGTAAGGCTAGAGTGCAGGAGGGGCGAGTGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAGGAACACCAGTGCGAGGCGCTCACTGGACTGTAACTGACACTGAGCT
+>152931
+TACGTAGGGAGCGAGCGTTGTCCGGAATTACTGGGCGTAAAGGGTGCGTAGGCGGCCTGGTAAGTCAGATGTGAAATACC
+CGTGCTCAACATGGGGGGTGCATCTGATACTGTTAGGCTTGAGTGCAGGAGAGGAAAGCGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAGGAACACCAGTGGCCAAGGCACACAGGGGATAGG
+>548884
+TACGTTGGTGGCGAGCGTTATCCGGAATCATTGGGCGTAAAGAGGGAGCAGGCGGCCGCAAGGGTCTGTGGTGAAAGACC
+GAAGCTAAACTTCGGTAAGCCATGGAAACCGGGCGGCTGGAGTGCGGAAGAGGATCGTGGAATTCCATGTGTAGCGGTGA
+AATGCGTAGATATATGGAGGAACACCAGTGGCGAAGGCGACGGTCTGGGCCGCAACTGACGCTCATTCCCGAAAGCGTGG
+GGAGCAAATAGGATTAGGACCTAC
+>588899
+TAAGGAAAAGAAGCGCGGAAATCGCGACTTTATTTCGACAGCGACGTCATCCGCAACTTTATGCGCGGCAAAAGCCCGAA
+ACACTTGCAAAATAGGATATTTTTGATACAATAAACAAAACCAAAAAGAAGGCTTACAATGGAAAAATACAAAATGACCA
+CTCCGCTTGTCGAAATGGACGGCGATGAAATGACGCGTATTCTGTGGGCGGACATCAAAGAACTGCTGCTCACTCCTTAC
+ATCGACCTTAATACCGAATATTACGATCTCGGTCTTCGTCACCGTGACGACACCGACGACAAAGTCACCTTAGACGCCGC
+GTACGCCACGAAAAATACGGCGTTGCGGTCAAGTGCGCTACGATCACC
+>328739
+TACGGAAGGTCCGGGCGTTATCCGGATTTATTGGGTTTAAAGGGAGCGTAGGCCGTTTGTTAAGCGTGTTGTGAAATGTC
+GGGGCTCAACCTGGGCATTGCAGCGCGAACTGGCAGACTTGAGTGCGCGGGAAGTAGGCGGAATTCGTCGTGTAGCGGTG
+AAATGCTTAGATATGACGAAGAACTCCGATTGCGAAGGCAGCTCACTGGAGCGCAACTGGCGCTGAAGCTCGAAAGTGCG
+GGTATCGAACAGGATTAGATACCCTGGTAGTCCGCACGGTAAACGATGGATGCCCGCTGTTGGTCTGAATAGGTCAGCGG
+CCAAGCGAAAGCATTAAGCATCCCACTGGGGATACGCCGGCAACGGTGAAACGTCAAAGGAATTGACGGGGCCCGCGACA
+AGCGGAGGAACATGTGGTTAATTCGATGATACGCGAGGAACCTTACCCGGG
+>112752
+TACGTAGGTGGCAAGCGTTGTCCGGATTTACTGGGTATAAAGGGAGCGCAGGCGGGAGAGCAAGTCAGCGGTGAAATACA
+TGGGCTTAACCCATGAACTGCCGTTGAAACTGTTTTTCTTGAGTGAAGTAGAGGCAGGCGGAATTCCGAGTGTAGCGGTG
+AAATGCGTAGATATTCGGAGGAACACCAGTGGCGAAGGCGGCCTGCTGGGCGTTTTACTTGACGCTGAGGCTCGAAGACG
+TGGGGAGGCAAACAAGGATTAGATACCCTGGTAGTCCACGCTGTAAACGATGATTGCTAGGTGT
+>144065
+AACGTAGGGTGCAAGCGTTGTCCGGAATTACTGGGTGTAAAGGGAGCGCAGGCGGGAAGACAAGTTGGAAGTGAAAACCA
+TGGGCTCAACCCATGAATTGCTTTCAAAACTGTTTTTCTTGAGTAGTGCAGAGGTAGATGGAATTCCCGGTGTAGCGGTG
+AATGCGTAGATATCGGGAGGAACACCAGTGGCGAAGGCGGTCTACTGGGCACCAACTGACGCTGAGGCTC
+>509511
+TACGTAGGGGGCAAGCGTTATCCGGAATTACTGGGTGTAAAGGGTGCGTAGGTGGTATGGCAAGTCAGAAGTGAAAACCC
+AGGGCTTAACTCTGGGACTGCTTTTGAAACTGTCAGACTGGAGTGCAGGAGAGGTAAGCGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAGGAACATCAGTGGCGAAGGCGGCTTACTGGACTGAAACTGACACTGAGGCACGAAAGCGTG
+GGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGAATACTAGGTGTCGGGGCCGTAGAGGCTTCGG
+TGCCGCAGCCAACGCAGTAAGTATTCCACCTGGGGAGTACGTTCGCAAGAATGAAACTCAAAGGAATTGACGGGGACCCG
+CACAAGCGGTGGAGCATGTGGTTTAATTCGAAGCAACGCGAAGAACCTTACCTGGTCTTGACATCCTT
+>266208
+TACGTAGGGGGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGTAGATGGCATGGCAAGTCTGAAGTGAAAGCCC
+GGGGCTTAACCCCGGGACTGCTTTGGAAACTGTTAAGCTAGAGTGCAGGAGAGGTAAGCGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAGGAACACCGGTGGCGAAGGCGGCTTACTGGACTGTAACTGACATTGAGGCTCGAAAGCGTG
+GGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGAATACTAGGTGTTGGGGAGCATAAGCTCTTCG
+GTGCCGGCGCAAACGCATTAAGTATTCCCACCTGGGGAGTACGTTCGCAAGAATGAAACTCAAAAGGATTGACGGGGACC
+CGCACAAGCGGTGGAGCATGTGGTTTAATCGACGCAACGCGAAGACCTTACCAAGTCTTGACATCCCGTGACCGGTCAGT
+AACGTGTACCTTTTTCTC
+>240686
+TACGTATGGAGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGTGTAGGTGGTATCACAAGTCAGAAGTGAAAGCCC
+GGGGCTCAACCCCGGGACTGCTTTTGAAACTGTGGAACTGGAGTGCAGGAGAGGTAAGTGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCTTACTGGACTGTAACTGACACTGAGGCTCGAAAGCGTG
+GGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGAATACTAGGTGTCGGGGCTCATAAGAGCTTCG
+GTGCCGCAGCAAACGCAATAAGTATTCCACCTGGGGAGTACGTTCGCAAGAATGAAACTCAAAAGGAATTGGACCGGGGA
+CCCGCGACAAGCGGTGGAGCATGTGGTCTAATCGAAGCAACGCGAAGAACCTTACCAAGTCTTAGACATCCTTCTGACCG
+GACAGTAATGT
+>339472
+TACGTAGGGGGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGTAGACGGAGCAGCAAGTCTGATGTGAAAGGCG
+GGGGCTCAACCCCCGGACTGCATTGGAAACTGTTGATCTTGAGTACCGGAGAGGTAAGCGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCTTACTGGACGGTAACTGACGTTGAGGCTCGAAAGCGTG
+GGGAGCAAACAGGATTAAGATACCTTGGTAGTCCACGCCGTAAACGATGAATACTAGGTGTCGGGTGGCAGGGCCATTCG
+GTGCCGCAGCAAACGCAGTAAGTATTCCACCTGGGGAGTACGTTCGCAAGAATGAAACTCAAAGGAATTGACGGGGACCC
+GGCACAAGCGGTGGAGCGATGTGGTTTAATTCGAAGCAACGCGAAGAACCTTACCAAGTCTTGACATACCCTCTGACCGG
+TGAGTAACGTCACCCTTCCTTCGGG
+>78316
+TACGTAGGGAGCGAGCGTTGTCCGGAATTACTGGGTGTAAAGGGAGCGTAGGCGGGATCTTAAGTCAGGTGTGAAAACTA
+TGGGCTCAACCCATAGACTGCACTTGAAACTGAGGTTCTTGAGTGAAGTAGAGGCAGGCGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAGGAACATCAGTGGCGAAGGCGGCCTGCTGGGCTTTTACTGACGCTGAGGCTCGAAAGCGTG
+GGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCTGTAAACGATGAATACTAGGTGTAGGGGTTGTCATGACCTCTGT
+GCCGCCGCTAACGCATTAAGTATTCCGCCTGGGGAGTACGGTCGCAAGATTAAAACTCAAAGGAATTGACGGGGGCCCGC
+ACAAGCAGCGGAGCATGTGGTTTAATTCGAAGCAACGCGAAGAACCTTACTAGACTTGACATCTCCTGCATTACTCTTAA
+TCGAGGAAGTCCC
+>570817
+TACGTATGGTGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGCAGGCGGTGCGGCAAGTCTGATGTGAAAGCCC
+GGGGCTCAACCCCGGTACTGCATTGGAAACTGTCGTACTAGAGTGTCGGAGGGGTAAGCGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGATTACTGGACGACAACTGACGCTGAAACACGAAAGCGTG
+GGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGAATGCTAGGTGTAGGCAAGAAATTGTCTGTGC
+CGGAGTAAACACAATAAGCATTCCACCTGGGGATACGGCCGCAAGGTTGAAACTCAAAGGAATTGACGGGAGCCCGCACA
+AGCAGTGGAGTATGTGGTTTAATCGACGCAACGCGAAGAACCTTA
+>487725
+TACGTAGGGAGCGAGCGTTGTCCGGAATTACTGGGTGTAAAGGGAGCGTAGGCGGGATCGCAAGTCAGATGTGAAAACTA
+TGGGCTTAACCCATAAACTGCATTTGAAACTGTGGTTCTTGAGTGAAGTAGAGGTAAGCGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAGGAACATCAGTGGCGAAGGCGGCTTACTGGGCTTTAACTGACGCTGAGGCTCGAAAGCGTG
+GGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGAATACTAGGTGTTGGGGAGCATAAGCTCTTCG
+GTGCCGGCGCAAACGCATTAAGTATTCCACCTGGGGAGTACGTTCGCAAGAATGAAACTCAAAGGAATTGACGGGGACCC
+GCACAAGCGGTGGAGCATGTGGTTTAATTCGACGCAACGCGAAGAACCTTACCAAGTCTTGACATCCCGGGTGACCCGGG
+ATGGTAACGCATCCCTTTTTCTTCGGAACACCGGTGACAGGTGGTGCATT
+>254841
+TACGTAGGGTGGCAAGCGTTGTCCGGAATTATTGGGACGGTAAAGCGCGCGCAGGCGGCTTCTTAAGTCCATACCTTAAA
+AAGGTGCGGGGCTTAACCCCGGTGATGGGTATGGAAACTGGGAGGCTGGAGTATCGGAGAGGAAAGTGGAATTCCTAGTG
+TAGCGGTGAAATAGCGTAGAGATTAGG
+>511687
+TACGGAGGATCCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGAGCGTAGGCGGACTATTAAGTCAGCTGTGAAAGTTT
+GCGGCTCAACCGTAAAATTGCAGTTGATACTGGTCGTCTTGAGTGCAGTAGGAGGTAGGCGGAATTCGTGGTGTAGCGGT
+GAAATGCTTAGATATTACGAAGAACTCCGATTGCGAAGGCAGCTTACTGGACTGTAACTGACGCTGATGCTCGAAAGTGT
+GGGTATACAAACAGGATTAGA
+>210791
+TACGTAGGTGGCAAGCGTTGTCCGGATTTACTGGGTGTAAAGGGCGTGCAGCCGGGCAGGCAAGTCAGATGTGAAATCTG
+GAGGCTTAACCTCCAAACTGCATTTGAAACTGTCTGTCTTGAGTATCGGAGAGGTAATCGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAAGAACACCAGTGGCGAAGGCGGATTACTGGACGACAACTGACGGTGAGGCG
+>114292
+TACGGAAGGTCCGGGCGTTATCCGGATTTATTGGGTTTAAAGGGAGCGCAGGCCGTCCTTTAAGCGTGCTGTGAAATGCC
+GCGGCTCAACCGTGGCACTGCAGCGCGAACTGGAGGACTTGAGTACGCACGAGGTAGGCGGAATTCGTGGTGTAGCGGTG
+AAATGCTTAGATATCACGAAGAACTCCGATTGCGAAGGCAGCTTACCGGAGCGCAACTGACGCTGAGGCTCGAAAGCGCG
+GGTATCGAACAGGATTAGATACCCTGGTAGTCCGGCGCGGTAAACGATGGATGCCCGCCGTTGGGATTTGGATTTCAGCG
+GCCAAGCGAAAGCGTTAAGCACTCCCACCTGGGGAGTACGCCGGCAACGGTGAAACTC
+>114029
+TACGGAAGGTCCGGGCGTTATCCGGATTTATTGGGTTTAAAGGGAGCGTAGGCCGGAGATTAAGCGTGTTGTGAAATGTA
+GTGGCTCAACCTCTGCACTGCAGCGCGAACTGGTCTTCTTGAGTACGCACAACGTGGGCGGAATTCGTGGTGTAGCGGTG
+AAATGCTTAGATATCACGAAGAACTCCGATTGCGAAGGCAGCTCACGGGAGCGCAACTGACGCTGAAGCTCGAAAGTGCG
+GGTATCGAACAGGATTAGATACCCTGGTATTCCACGCCGTAAACGATGATGGCTAACCGCCGGCGACACACTGTCGGTGG
+CCAAGCGAAAGCGATAAGCCATCCACCTGGGGAGTACGTCGGCACGATGAAACTCAAAGGAATTGGAC
+>540283
+TACGGAGGATCCAAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGTAGGCGGTTTGATAAGTTAGAGGTGAAATACC
+GGGGCTCAACTCCGGAACTGCCTCTAATACTGTTGAACTAGAGAGTAGTTGCGGTAGGCGGAATGTATGGTGTAGCGGTG
+AAATGCTTAGAGATCATACAGAACACCGATTGCGAAGGCAGCTTACCAAACTATATCTGACGTTGAGGCACGAAAGCGTG
+GGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCAGTAAACGATGATAACTCGTTGTCGGCGATACACAGTCGGTGA
+CTAAGCGAAAGCGATAAGTATCCACCGTGGGGAGTACGTCGCAAAGAATGAAACTCAAAGGAATTGGACGGGGGCCCGCA
+CAAGCGGAGGAACATGT
+>584265
+TACGTATGGTGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGTAGACGGCTGTGTAAGTCTGAAGTGAAAGCCC
+GGGGCTCAACCCCGGGACTGCTTTGGAAACTATGCAGCTAGAGTGTCGGAGAGGTAAGTGGAATTCCCAGTGTAGCGGTG
+AAATGCGTAGATATTGGGAGGAACACCAGTGGCGAAGGCGGCCTGCTGGGCTTTAACTGACGCTGAGGCACGAAAGCGTG
+GGTAGCAAACAGGATTAGATACCCTGGTAGTCCACGCTGTAAACGATGATTACTAGGTGTGGGGGGTCTGACCCTTCCGT
+GCCGGAGTTAACACAATAAGTAATCCACCTGGGGAGTACGGCCGCAAGGTTGAAACTCAAAGGAATTGACGGGGGCCCGC
+GACAAGCAGTGGAGTATGTGGTTTAATTCGAAGCAACGCGAAGAACCTTACCAGGTCTTGACATCCTACTAACGAGATA
+>162529
+TACGGAGGATGCAAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGTAGGCGGCACGCCAAGTCAGCGGTGAAATTTC
+CGGGCTCAACCCGGACTGTGCCGTTGAAACTGGCGAGCTAGAGTACACAAGAGGCAGGCGGAATGCGTGGTGTAGCGGTG
+AAATGCATAGATATCACGCAGAACCCCGATTGCGAAGGCAGCCTGCTAGGGTGAAACTGACGCTGAGGCGCGAAAGCGTG
+GGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCTGTAAACGATGAATACTAGGTGTGGGAGGACTGACCCCTTCGT
+GCCGCAGTTAACACAATAAGTATTCCACCTGGGGAGTACGACCGCAAGGTTGAAACTCAAAGGAATTGACGGGGCCCGCG
+ACAAGCAGTGGATTATGTGGTTTAATTCGACGCAACGCGAAGAACCTTACCAGG
+>103639
+TACGTAGGGGGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGTAGACGGTGTGGCAAGTCTGATGTGAAAGGCA
+TGGGCTCAACCTGTGGACTGCATTGGAAACTGTCATACTTGAGTGCCGGAGGGGTAAGCGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCTTACTGGACGGTAACTGACGTTGAGGCTCGAAAGCGTG
+GGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGAATACTAGGTGTCGGGGAGCATGGCTCTTCGG
+TGCCGTCGCAAACGCAGTAAGTATTCCACCTGGGGATACGTTCGCAAGAATGAAACTCAAAGG
+>314793
+TACGTAGGGGGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGTAGACGGACTGGCTAAGTCTGATGTGAAAGGC
+GGGGGCTCAACCCCTGGACTGCATTGGAAACTGTTAGTACTTGAGTGCCGGAGAGGTAAGCGGAATTCCTAGTGTAGCGG
+TGAAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCTTACTGGACGGTAACAGACGTTGAGGCTCGAAAGCG
+TGGGGAGCAAACAGGATTAG
+>134929
+ATACGGAGGATGCGAGCGTTATTCCGGATTTATTGGGTTTAAAGGGAGCGCAGACGGGAGATTAAGTCAGTTGTGAAAGT
+TTGCGGCTCAACCGTAAATTGCAGTTGATACTTGGTTTCCTTGAGTGCAGTTGAGGCAGGCGGAATTCGTGGTGTAGCGG
+TGAAATGCTTAGATATCACGAAGACCCCGATTGCGAAGGCAGCTTGCTAAACTGTAACTGACGTTTCATG
+>571389
+TACGGAGGATCCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGAGCGTAGATGGATGTTTAAGTCAGTTGTGAAAGTTT
+GCGGCTCAGCCGTAAAATTGCAGTTGATACTGGATATCTTGAGTGCAGTTGAGGCAGGCGGAATTCGTGGTGTAGGCGGT
+AGAAATGCTTAGA
+>10857
+TACACCCAAACAAATCACCAACCATACCAAACATCCGGTACGCTTCCTGAGTGCCGCCTCCAATGGAACATTATGCTATG
+GATATGATGGCGAAATCTACACGGTAAAAGAAGATGCCATTCCGCAAAAAGTTCAGATTTCCATTGTCACCGACAAGAAC
+GACAAAGATCTCATCCGCCAGATCAAGCGTAGCGGAGCAACAGAGATATCCCTTTCGCCCGATGCAAAGGAAATTGCTTC
+TGTACTACGCGGAGATGTATATGTCACATCAACAGAATACAGCACTACCAAACAAATCACCAATACTCCGCAACAGGAAC
+GCGATATTCACTTTTCTCCGGACGGAAGAAGTATTGTTTACGCTTCCGAAAGAAACGGATTATGGCAGATTTTATCAAAC
+CAGTCTAGCTAAAAAGGAAGAAAAACAGTTCGCTTACGCAACCGATATCAAAGAGGAAAGACTGACCAATTCGGATATCA
+C
+>77467
+TACGTAGGGAGCGAGCGTTGTCCGGAATTACTGGGTGTAAAGGGAGCGTAGGCGGGATCGCAAGTCAGATGTAGAAAACT
+ATGGGCTTAAACCCAATAAAACTGCATTTGAAAACTGTGGTTCTTGAGTGAAGTAGAGGTAAGCGGAATTCCTAGTGTAA
+GCGGTGAAATGCGTAGATATTAGGAGGAACATCAGTGGCGAAGGCGGCTTACTGGGCTTTAAACTGACGCTG
+>511558
+TACGTAGGGAGCGAGCGTTGTCCGGAATTACTGGGTGTAAAGGGAGCGTAGGCGGGACGGCAAGTCAGATGTGAAATACA
+TGGGCTCAACCCATGGGCTGCATTTGAAACTGCTGTTCTTGAGTGAAGTAGAGGTAAGCGGAATTCCTGGTGTAGCGGTG
+AAATGCGTAGAGATGGGGAGAACACCAGTGGCGAAGGCGGCCTGCTGGGCTTTAACTGACGCTGAGGCACGAAAGCGTGG
+GTAGCAAACAGGATTAGATACCCTGGTAGTCCAACGCTGTAAACGATGATTACTAGGTGTGGGGGTCTGACCCCTTCCGT
+GCCGGAGTTAAC
+>589130
+TACGGAGGATGCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGTAGGCGGCGATGCAAGTCAGCGGTAAAAGGCC
+GGGCGTCAACCCGGCGAGCCGTTGAAACTGCAGTGCTAGAGAAGGCGAGAGGTACGCGGAATGCACAGTGTAGCGGTGAA
+ATGCTTAGATATTGCGCAGAACTCCGATTGCGAAGGCAGCGTACTGGCGCCTGACTGACGCTGAGGCACGAAAGCGTGGG
+GATCGAACAGGATTAGATACCCTGGTAGTCCACGCAGTAAACGATGAATGCCGGCTGTTCGGGTTGATTGAGACCTGAGC
+GGCGAAGCGAAAGCGATAAGCATTCCACCTGGGGAGTACGCCGGCAACGGTGAAACTCAAAGGAATTGACGGGGGCCCGC
+ACAAGCGGAGGAACATGTGGTTTAATTCGATGATACGCGAGGAACCTTACCCGGGCTTAACTA
+>218005
+TACGTAGGTGGCAAGCGTTGTCCGGATTTACTGGGTGTAAAGGGCGTGTAGCCGGGAGGGCAAGTCAGATGTGAAATCCA
+CGGGCTCAACTCGTGAACTGCATTTGAAACTACTCTTCTTGAGTATCGGAGAGGCAATCGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGTCAACTGGACGATAACTGACGCTGAGGCGCGAAAGCGTG
+GGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGAATACTAGGTGTGGGGGGACTGACCCCTTCCG
+TGCCGCAGTTAACACAATAAGTATTCCACCTGGGGAGTACGACCGCAAGGTTGAAACTCAAAGGAATTGACGGGGGCCCG
+CACAAGCAGTGGATATGTGGTTTAATTCGAAGCAACGCGAAGAACCTTACCAGGGCTTGACAT
+>241853
+AACGTAGGGTGCAAGCGTTGTCCGGAATTACTGGGTGTAAAGGGAGCGCAGGCGGGAAGACAAGTTGGAAGTGAAAACCA
+TGGGCTCAACCCATGAATTGCTTTCAAAACTGTTTTCTTGAGTAGTGCAGAGGTAGATGGAATTCCCGGTGTAGCGGTGG
+AATGCGTAGATATCGGGAGGAACACCAGTGGCGAAGGCGGTCTACTGGGCACCAACTGACGCTG
+>568881
+TACGTAGGTGGCAAGCGTTGTCCGGATTTACTGGGTGTAAAGGGCGCGTAGGCGGGATGGCAAGTCAGATGTGAAATCCA
+TGGGCTCAACCCATGAACTGCATTTGAAACTGTCGTTCTTGAGTATCGGAGAGGCAAGCGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCTTGCTGGACGACAACTGACGCTGAGGCGCGAAAGCGTG
+GGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGAATACTAGGTGTGGGGGGACTGACCCCCTCCG
+TGCCGCAGTTAACACAATAAGTATTCCACCTGGGGAGTACGGCCGCAAGGTTGAAACTCAAAGGAATTGACGGGGCCCGC
+ACAAGCGGAGGAACATGTGGTTTAATTCGATGATACGCGAGGAACCTTACCCGGGCTTAAATTGCAGATGAATTACGGTG
+AAAGCCGTAAAGCCGCAAGGCATCTGT
+>136925
+TACGTAGGGGGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGTAGGCGGCGGAGCAAGTCAGAAGTGAAAGCCC
+GGGGCTCAACCCCGGGACGGCTTTTGAAACTGCCCTGCTTGATTTCAGGAGAGGTAAGCGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCTTACTGGACTGACAATGACGCTGAGGCTCGAAAAGCGT
+GGGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGAATACTAGGTGTCGGGAGCCAAAAGGCTTTC
+GGTGCCGCAGCAAACGCAATAAGTATTCCACCTGGGGAGTACGTTCGCAAGAATGAAACTCAAAGGAATTGACGGGGACC
+CGCACAAGCGGTGGAGCATGTGGTTTAATTCGAAGCAACGCGAAGAACCTTACCAGGTCTTGACATCTGGCTAACGAAGT
+AGAGATACA
+>165369
+GCAGGAAATTATACCATAGCCACATATTAGAAGCCAGTTCCGTGTTATCCCATGCAAAACCTCCGATGTCATAACGCCAG
+GTATGGCGGACAGGATCGTATGCATGCATCACATCCCCATAATTCCAGAAACCATACCATTTGTTTTGTTCTATCGCTTT
+TTGATAGAAGCTGATATAAGCATCCAGCCTGTCTTCCACCCGGGCACGGAAAGGAGTACTACGATCGGGAAGGCTCCAAA
+CACCAAACGCCTGCTTGGCATGCAAATAGTCAGGTACAGGCATC
+>236118
+TACGTATGGTGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGTAGACGGATAAGCAAGTCTGGAGTGAAAGCCC
+GGGGCTCAACCCCGGGACTGCTTTGGAAACTGTTAATCTAGAGTGCTGGAGAGGTAAGTGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCTTACTGGACAGTGACTGACGTTGAGGCTCGAAAGCGTG
+GGGA
+>237963
+TACGTATGGAGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGTGTAGGTGGCCATGCAAGTCAAGAAAGTAAAAAG
+TCCGGGGCTCCAACCCCGGAACTGCTTTTGAAACTGTAAGGCTAGAGTGCGAGGAGGGGTGAGTGGAATTCCTAGTGTAG
+CGGTAGAAATGCGTAGATAGTAGG
+>204177
+GTATTCCACCTGGGGAGTACGCTCGCAAGAATGAAACTCAAAGGAATTGACGGGGACCCGCACAAGCGGTGGAGCATGTG
+GTTTAATTCGAAGCAACGCGAAGAACCTTACCAAATCTTGACATCGATCCGACGGACCGTAATGGGTCCTTTCCTTCGGG
+GACGGAGAAGACAGGTGGTGCATGGTTGTCGTCAGGCTCGTGTCGTGAGATG
+>332464
+TACGTAGGGGGGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGTGTAGGTGGCTGTGCAAGTCAGGAGTGAAAGCC
+CGGGGCTCAACCCCGGGACTGCTCTTGAAACTGTGCGGCTTGAGTGCAGGAGGGGCAGGCGGAATTCCTAGTGTAGCGGT
+GAAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCTTACTGGACGATAACTGACGCTGAGGCTCGAAAGCGT
+GGGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGAATACTAGGTGTCGGGGAGCATTGCTCTTCG
+GTGCCGCAGCAAACGCAATAAGTATTCCACCTGGGGAGTACGTTCGCAAGAATGAAACTCAAAGGATTGGACGGGGACCC
+GCACAAGCGGTGGAGCATGTGGTTTAATCGAAGCAACGCGAAG
+>303295
+TACGTAGGGAGCGAGCGTTGTCCGGAATTACTGGGTGTAAAGGGAGCGTAGGCGGGATCGCAAGTCAGATGTGAAAACTA
+TGGGCTTAACCCATAAACTGCATTTGAAACTGTGGTTCTTGAGTGAAGTAGAGGTAAGCGGAACTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAGGAACATCAGTGGCGAAGGCGGCTTACTGGGCTTTAACTGACGCTGAGGCTCGAAAGCGTG
+GGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGATTACTAGGTGGTGGGGGGACTGACCCCTTCC
+CGTACCGCAGCAAACGCAAATAAGTAATCCACCGTGGGGAGTACGACCGGCAAGGTTAGAAACTACGAAAGGAA
+>76933
+GGAAGAACTTCTGGCTAAGGAACAAGTGACTGCATACATTGGTTTCGACCCTACTGCGGACTCTTTGCACATCGGGCACC
+TTTGCAGTGTGATGATTTTGCGTCACTTCCAGCGTTGCGGACACAAGCCGTTGGCACTGATTGGCGGTGCTACGGGTATG
+ATTGGTGACCCTTCCGGAAAGTCTGCCGAACGCAATCTGCTTACGGAAGAGACGTTGCAGCGCAACTTGGCCGGTATGAA
+GGCCCAGCTCTCCAAGTTCCTGGATTTTGATTCGGATGCCCCCAACCGTGCCGAGTTGGTGAACAACTACGATTGGATGA
+AGAACTTCACTTTCCTCGATTTTGCCCGTGAAGTGGGTAAGCACATCACCGTGAACTATATGATGGCGAAAGATTCTGTA
+AAGAAGCGTCTGAACGGTGAGGCACGTGACGGTCTTTCCTTCACGGAATTCACCTACCAGCTGTTGCAGGGGTAC
+>565375
+TACGTAGGTGGCAAGCGTTATCCGGAATTATTGGGCGTAAAGGGTGCGTAGGCGGTGTATTAAGTCTGAAGTAAAATCGT
+GGGGCTCAACCCATCAAGCTTTGGAAACTGGTAGACTAGAGTGCAGTAGAGGCAGATGGAATTCCATGTGTAGACGGTAA
+AATGCGTAGATATAGT
+>244674
+TACGGAGGATCCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGAGCGTAGGCGGGTGCTTAAGTCAGTTGTGAAAGTTT
+GCGGCTCAACCGTAAAATTGCAGTTGATACTGGGCGCCTTGAGTGCAGCATAGGTAGGCGGAATTCGTGGTGTAGCGGTG
+AAATGCTTAGATATCACGAAGAACTCCGATTGCGAAGGTCAGTCTTACGTGGAC
+>541076
+TACGGAGGATCCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGAGCGTAGGCGGGGTATTAAGTCAGTTGTGAAAGTTT
+GCGGCTCAACCGTAAAATTGCAGTTGATACTGGTATCCTTGAGTGCAGCAGAGGTGGGCGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAGGAACACCGGTGGCGAAGGCGACTTTCTGGACAATAACTGACGTTGAGGCACGAAAGTGTG
+GGGAGCAAACAGGATTAGATACCCTGGTAGTCCACACCGTAAACGATGGATACTAGGTGTAGGAAATGATTTCATTTTCT
+GTGCCGTCGCAAACGCAATAAGTATCCCACCTGGGGATACGGCCGCAAGGTTGAAACTCAAAGGATTGGACGGGGCCCGC
+ACAAGCAGGTGGAGTATGGTGGTTTAATTCGAAGCAAC
+>10660
+TACGTAGGTGGCGAGCGTTATCCGGATTTACTGGGTGTAAAGGGCGCGTAGGGCGGGGATAGCAAGTCAGATGTGAAATC
+CGTGGGCTCAACCCACGAACTGCATTTGAAACTGTTATTCTTGAGTGATGGAGAGGCAAGCGGAATTCCTAGTGTAGCGG
+TGAAATGCGTAGATA
+>261919
+TACGTAGGGAGCAAGCGTTATCCGGATTTATTGGGTGTAAAGGGTGCGTAGACGGGACAACAAGTTAGTTGTGAAATCCC
+TCGGCTTAACTGAGGAACTGCAACTAAAACTATTGTTCTTGAGTGTTGGAGAGGAAAGTGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATAAGGAAGAACACCAGTGGCGAAGGCGGTCTACTGGACAGTAACTGACGCTGAGGCGCGAGAGCGTG
+GGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCTGTAAACGATGAATACTAGGTGTGGGAGGACTGACCCCTTCCG
+TGCCGCAGTTAACACAATAAGTATTCCACCTGGGGAGTACGGCCGCAAGGTTGAAACTCAAAGGAATTGACGGGGGCCCG
+CACAAGCAGTGGATTATGTGGTTTAATTCGAAGCAACGCGAAGAACCTTACCAGGGCTTGACATCCAACTAACGAAGCAG
+AGATGCA
+>143463
+TACGGAAGGTCCGGGCGTTATCCGGATTTATTGGGTTTAAAGGGAGCGTAGGCCGGAGATTAAGCGTGTTGTGAAATGTA
+GACGCTCAACGTCTGCACTGCAGCGCGAACTGGTCTTCTTGAGTACGCACAACGTGGGCGGAATTCGTGGTGTAGCGGTG
+AAATGCTTAGATATCCCCCCAAGAAGAACTCGATTGCGAAGGCAGCTCACGTGGA
+>522127
+TACGTAGGGGCGAGCGGTTATCCGGATTTATTGGGCGTAAAGCGTGTGTAGGCGGTTTATTAAGTCCGAGATGAAAGGCT
+GAGGCTTAACCTCAGTTTGTTTCGGAAACTGGTAGACTAGAGTGCAGTAGAGGCAATTGGAATTCATAGTGTAGCGGTAA
+AATGCGTAGATATTATGAGGAACATCAGTGGCGAAGGCGAATTGCTGGGCTGTTACTGACGCTGAGACACGAAAGCGTGG
+GAGCAAATAGGATTAGATACCCTAGTAGTCCAACGCCGTAAACGATGAGCACGTAAGTGTCCGGGCAACCGGTGCTGAAG
+TTAACACATTAAGTGCTCCGCCGTGAGTAGTACGGTCGCAAGACTAGAAACTCAAAGGGAACTTG
+>258229
+TACGTAGGTGGCAAGCGTTGTCCGGATTTACTGGGTGTAAAGGGCGTGTAGCCGGGAGGGCAAGTCAGATGTGAAATCCA
+CGGGCTCAACTCGTGAACTGCATTTGAAACTACTCTTCTTGAGTATCGGAGAGGCAATCGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGATTGCTGGACGACAACTGACGGTGAGGCACGAAAGTGTG
+GGGAGCAAACAGGATTAGATACCCTGGTAGTCCACACCGTAAACGATGGATACTAGGTGTAGGGAGCATTAAGTTTCTGT
+GCCGTCGCAAACGCAAATAAGTATCCCACCTGGGGAGTACGGCCGCAAGGTTGAAAACTCAAAGGAAGTTGACGGGGGCC
+CGCGACAAGCAGTGGAGTATGTGGTTTAATTCGACGCAACGCGAAGAACCTTA
+>172221
+TACGTAGGGAGCGAGCGTTGTCCGGAATTACTGGGTGTAAAGGGAGCGTAGGCGGGACTGCAAGTCAGATGTGAAAACTA
+TGGGCTCAACCTGTAGATTGCATTTGAAACTGTGGTTCTTGAGTGAAGTAGAGGTAAGCGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAGGAACATCGGTGGCGAAGGCGGCTTACTGGGTCTTTTACTGACGCTGAGGCTCGAAAGCGT
+GGGGAGCAAACAGGATTAGATACCCTGGTACGTCCACGCTAGTAAACGATGATTACGTAGGTGGTGGGGG
+>561793
+TACGGAGGGTGCAAGCGTTGTCCGGAATCATTGGGCGTAAAGAGTTCGTAGGCGGCATGTAAAGTCAGGTGTTAAAGGCT
+GAGGCTCAACCTCAGTATGGCACTTGATACTTGCAAGCTAGAATGCGGTAGAGGTAAAGGGAATTCCTGGTGTAGCGGTG
+AAATGCTTAGATATCAGGAGGAACATCGGTGGCGTAAGCGCTTTACTGGGCCGTAATTGACGCTGAGGAACGAAAGCCAG
+GGTAGCAAATGGGATTAGATACCCCAGTAGTCCTGGCCGTAAACGATGAATACTAGGTGTCGGGGGCTCATAAGAGCTTC
+GGTGCCGCAGCAAACGCAATAAGTACTCCACCTGGGGAGTACGTTCGCAAGAATGAAACTCAAAGGAATTGACGGGACCC
+GCACAAGCGGTGGAGCATGTGGTTTAATTCGAAGCAACGCGAAGACCTTACCAGTCTTGACATCCTTCTGACCGGACAGT
+AATGT
+>548838
+TACGGAAGATGCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGAGCGTAGGCGGGCTTTTAAGTCAGCGGTCAAATGTC
+GTGGCTCAACCATGTCAAGCCGTTGAAACTGTAAGCCTTGAGTCTGCACAGGGCACATGGAATTCGTGGTGTAGCGGTGA
+AATGCTTAGATATCACGAAGAACTCCGATCGCGAAGGCATTGTGCCGGGGCATAACTGACGCCTAGAGGCGTCGAAAGTG
+CGGGGTATCAAACAGGATTAGATACCCTGGTAGTCCGCACGGTAAACGATGAAGTGACTCGCTATGGGCGATATATTGT
+>330803
+TACGTAGGTGGCAAGCGTTGTCCGGATTTACTGGGTGTAAAGGGCGTGTAGCCGGGAGGGCAAGTCAGATGTGAAATCCA
+CGGGCTTAAACTCGTGAACTGCAATTTGAAACTGTTCTTCTTGAGTATCGGAGAGGCAATCGGAATTCCTAGTGGTAGAC
+GGTGAAATGCGTAGATATTAGGAGGAA
+>88682
+TACGTAGGGGGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGTAGACGGACTGGCAAGTCTGATGTGAAAGGCG
+GGGGCTCAACCCCTGGACTGCATTGGAAACTGTTAGTCTTGAGTGCCGGAGAGGTAAGCGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGACTTTCTGGACAGTAACTGACGTTGAGGCACGAAAGTGTG
+GGGAGCAAACAGGATTAGATACCCTGGTAGTCCACACCGTAAACGATGGATACTAGGTGTAGGGTGTGATAAACATTCTG
+TGCCGTCGCAAACGCAATAAGTATCCCACCTGGGGAGTACGGCCGCAAAGGTTGAAACTCAAAGGAATTGACGGGGGCCC
+GCACAAGCAGTGGAGTATGTGGTTTAATTCGACGCAACGCGAAGAACCTTACCAGGGCTTGACATATACAGGAATATATA
+AGAG
+>148678
+AACGTAGGTCACAAGCGTTGTCCGGAATTACTGGGTGTAAAGGGAGCGCAGGCGGGAAGACAAGTTGGAAGTGAAATCTA
+TGGGCTCAACCCATAAACTGCTTTCAAAACTGTTTTTCTTGAGTAGTGCAGAGGTAGGCGGAATTCCCGGTGTAGCGGTG
+AAATGCGTAGATATTCGGAGGAACACCAGTGGCGAAGGCGGCTTACTGGGCTCTAACTGACGCTGAGGCACGAAAGCATG
+GGTAGCAAACAGGATTAGATACCCTGGTAGTCCATGCTGTAAACGATGAATGCTAGGTGTGGGGGGACTGGACCCCTTCC
+GTGCCGGAGTTAACACAATAAGCATTCCACCTGGGGAGTACGACCGCAAGGTTGAAACTCAAAAGGAATTGACGGGGCCC
+GCGACAAGCAGTGGAGTAGTGTGGTTTAATTCGAAGC
+>469382
+TACGGAGGATCCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGTAGGCGGCCTTTTAAGTTCAGCGGTGAAAGTC
+TGTGGCTCAACCATAGAATTGCCGTTGAAACTGGGGGGCGTTGAGTATGTTTGAGGCAGGCGGAATGCGTGGTGTAGCGG
+TGAAATGCATAGATATCACGCAGAACCCCGATTGCGAAGGCAGCCTGCCAAGCCATTACTGACGCTGATGCACGAAAGCG
+TGGGGATCAAACAGGATTAGATACCCTGGTAGTCCACGCAGTAAACGATGATCACTAGCTGTTTGTGATACACTGTAAGC
+GGCACAGCGAAAAGCGTTAAAGTGACTCCCACCGTGGGGGAGTACC
+>287759
+TACGGAGGGTGCAAGCGTTAATCGGAATTACTGGGCGTAAAGCGCACGCAGGCGGTTTGTTAAGTCAGATGTGAAATCCC
+CGGGCTCAACCTGGGAACTGCATCTGATACTGGCAAGCTTGAGTCTCGTAGAGGGGGGTAGAATTCCAGGTGTAGCGGTG
+AAATGCGTAGAGATCTGGAGGAATACCGGTGGCGAAGGCGGCCCCCTGGACGAAGACTGACGCTCAGGTGCGAAAGCGTG
+GGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGTCGACTTGGAGGTTGTGCCCTTGAGGCGTGGC
+TTCCGGAGCTAACGCGTTAAGTCGACCGCCGTGGGGGGTACGCCGCAAAGGTTAAACTCAAATGAATTGACGGGGCCCGC
+ACAAGCGGTGAGCATGTGGTTTAATCGATGCAACGCGAACGAACCTTTACCGTGGTCTTGAC
+>512157
+TACGTAGGTGGCAAGCGTTGTCCGGATTTATTGGGTGTAAAGGGCGTGCAGCCGGGTCTGCAAGTCAGATGTGAAATCCA
+TGGGCTCAACCCATGAACTGCATTTGAAACTGTAGATCTTGAGTGTCGGAGGGGCAATCGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAGAACACCAGTGGCGGAAGGCGGATTGCTGGACGATACGTGACGGCGAGGCGACG
+>279132
+TTACTGGGTGCTTAATTTGAAAGACGGCAAACTGCAACAATTGGGCAAAAGCTTGCCGGAAGCTACCTTGATGTTCGCCA
+AGTTCTCGCCGGATGCCAGCCGGGTAGCATATGTCTCCAGAAACAACATTTATGTAGAGAGTCTGGTTGACGGGAAAATC
+AACCAACTAACTCAAGACGGAAATAATGAAATTGTAAACGGAACATTCGACTGGGTATACGAAGAAGAATTCAACTGCCG
+CGACGGTTTCCGCTGGAGTCCCGACGGGCAATACATAGCCTATTTGCAAAGCGATACCCAAGGTACGGGATGGTTTGATA
+TCATCAATAATGTAGATTCCATTTATCCTAAAATCCAACGTTTTCCCTACCCTAAAGCCGGAACAGCAAACTCCGCTGTA
+AAAGTAGGCTATGTATCAGCCGACGGTGGAAACACCACTTGGCTGGCTCTTCCAGTGA
+>521453
+TACGGGGGATGCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGCGCGTAGGCGGGACGCCAAGTCAGCGGTAAAAGACT
+GCAGCTAAACTGTAGCACGCCGTTGAAACTGGCGACCTGGAGACGAGACGAGGGAGGCGGAACAAGTGAAGTAGCGGTGA
+AATGCTTAGATATCACTTGGAACCCCGATAGCGAAGGCAGCTTCCCAGGCTCGATCTGACGCTGATGCGCGAGAGCGTGG
+GTAGCGAACAGGATTAGATACCC
+>542003
+AACGTAGGGTGCAAGCGTTGTCCGGAATTACTGGGTGTAAAGGGAGCGCAGGCGGGAAGACAAGTTGGAAGTGAAAACCA
+TGGGCTCAACCCATGAATTGCTTTCAAAACTGTTTTTCTTGAGTAGTGCAGAGGTAGATGGAATTCCCGGTGTAGCGGTG
+GAATGCGTAGATATCGGGAGGAACACCAGTGGCGAAGGCGGTCTACTGGGCACCCAATGACGCTGA
+>580835
+TACGGAGGATGCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGTAGGTGGTTTCTTAAGTCAGCGGTGAAAGTTT
+GTGGCTTAACCATAAAATTGCCATTGATACTGGGAGACTTGAGTATGTTTGAGGTAGGCGGAATGCGTGGTGTAGCGGTG
+AAATGCATAGATATCACGCAGAACTCCGATTGCGAAGGCAGCTTGCCAAGCCATAAACTGACACTGAAGCACGAAAGCGT
+GGGTATCAAACAGGATTAGATACCCTGGTAGTCCACGCAGTAAACGATGATTACTAGGAGTTTGCGATATAGTGTAAGCT
+CTACAGCGAAAGCGTTAAGAATCCACCTGGGGATACGCCGGCAACGGTGAAACTCAAAGGAATTGACGGGGGCCCGCACA
+AGCGGAGGAACATGTGGTTAATTCGATGATACGCGAGGAACCTTACCCGGGTTTGAACGCACAGCGACGATCAGGAAACT
+GATTTTCTAGCAATAGCGATGT
+>35881
+TACGTAGGGAGCGAGCGTTATCCGGATTTATTGGGTGTAAAGGGTGCGTAGACGGACTTACAAGTTAGTTGTGAAATCCC
+TCGGCTTAACTGAGAAACTGCAACTAAAACTGTAAGCCTTGAGTGCAGGAGAGGTAAGTGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAGGAACATCAGTGGCGAAGGCGGCTTACTGGACTGAAACTGACACTGAGGCACGAAAGCGTG
+GGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGAATACTAGGTGTCGGGCCGTAGAGGCCTCGGTG
+CCGCAGCCAACGCAATAAGTATTCCACCTGGGGAGTACGTTCGCAAGAATGAAACTCAAAGGAATTGACGGGGACCCGCA
+CAAGCGGTGAGCATGTGGTTTAATTCGAAGCAACGCGAAGAACCTTACCTGGTCTTGACATCCTTCTGACCGACTTCTTA
+ATCGTAGTTTTTCCTTGGGACAGGAGTGACAGGTGGT
+>262464
+TACGGAGGATGCGAGCGTTATCCGGATTTATTGGGTTTAGAGGGAGCGCAGACGGGAGATTAAGTCAGTTGTGAAAGTTT
+GCGGCTCAACCGTAAAATTGCAGTTGATACTGGTTTCCTTGAGTGCAGTTGAGGCAGGCGGAATTCGTGGTGTAGCGGTG
+AAATGCTTAGATATCGCGAAGAACCCCGATTGCGAAGGCAGCTTGCTAAACTGTAACTGACGTTCATGCTCGAAAGTGTG
+GGTATCAAACAGGATTAGATACCCTGGTAGTCCACACGGTAAACGATGGATACTCGCTGTTGGCGATATACTGTCAGCGG
+CCAAGCGAAAGCATTAAGTATCCCACCTGGGGATACGCCGGCAACGGTGAAACTCAAAGGAATTGACGGGGCCCGCGACA
+AGCGGAGGAACATGTGGTTTAATTCGATGATACGCGAGGAACCTTACCCGGGCTTAAATTGCAGACGAATTACGAGGAAA
+CTTGTAAGCCGCAAGGCGTCTGTGAAGGTG
+>342042
+TACGTAGGTGGCAAGCGTTGTCCGGAATTATTGGGCGTAAAGCGCGCGCAGGCGGCTTCCCAAGTCCCTCTTAAAAGGTG
+ACGGGGCCTTAACCCCGTGATGGGAAGGAAACTGGGAAGCTGGAGTATCGGAGAGGAAAGTGGAATTCCTAGTGTAGCGG
+TGAAATGCGTAGAGATTAGGAAGAACACCGGTGGCGAAGGCGACTTTCT
+>70461
+TACGGAAGGTCCGGGCGTTATCCGGATTTATTGGGTTTAAAGGGAGCGTAGGCCGGAGATTAAGCGTGTTGTGAAATGTA
+GTGGCTCAACCTCTGCACTGCAGCGCGAACTGGTCTTCTTGAGTACGCACAACGTGGGCGGAATTCGTGGTGTAGCGGTG
+AAATGCTTAGATATCACGAAGAACTCCGATTGCGAAGGCAGCTCACGGGAGCGCAACTGACGCTGAAGCTCGAAAGTGCG
+GGTATCGAACAGGATTAGATACCCTGGTAGTCCGCACGGTAAACGATGGATGCCCGCTGTTGGCCTGAATAGGTCAGCGG
+CCAAGCGAAAGCATTAAGCATCCCACCTGGGGAGCGCCGGCCGACGTACGGTAGAACTAACGAAAGTAGTACGGGCCCGG
+GGCCCGACAACGACGGACGGGAACGGAAC
+>583211
+TACGGAAGGTCCAGGCGTTATCCGGATTTATTGGGTTTAAAGGGAGCGCAGGCGGACCTTTAAGTCAGCTGTGAAATACG
+GCGGCTCAACCGTCGAACTGCAGTTGATACTGGAGGTCTTGAGTGCACACAGGGATACTGGAATTCATGGTGTAGCGGTG
+AAATGCTCAGATATCATGAAGAACTCCGATCGGCGAAGGCAGGTATCCGGGGCGCAACTGACGCTGAGGCTCGGAAAGTG
+ACGGGT
+>544325
+TACGGAGGATTCGAGCGTTATCCGGATTTACTGGGTGTAAAGGGCGTGTAGGCGGGATTACAAGTCAGATGTGAAATACC
+GGGGCTTAACTCCGGGGCTGCATTTGAAACTGTAGTTCTTGAGTGCCGGAGAGGAAAGCGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCTTTCTGGACGGTAACTGACGCTGAGGCGCGAAAGCGTG
+GGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGGATACTAGGTGTAGGAGGTATCGACCCCTTCT
+GTGCCGGAGTTAACACAATAAGTATCCCACCTGGGGAGTACGGCCGCAAGGTTGAAACTCAAAGGAATTGACGGGGGCCG
+CACAAGCAGTGGAGTATGTGGTTTAATTCGACGCAACGCGAAGAACCTTACCAGGGCTTGACATATAAGTGAATAATTAA
+GAGATTAGTTAGCTCTTCGGAGCACTTATACAGGTGGTGCA
+>511119
+ATACGTAGGGGGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGGAGCGTAGACGGTGTGGCAAGTCTGATGTGAAAGG
+CATGGGCTCAACCTGTGGACTGCATTGGAAACTGTCATACTTGAGTGCCGGAGGGGTAAGCGGAATTCCTAGTGTAGCGG
+TGAAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCTTACTGGACGGTAACTGACGTTGAGGCTCGAAAGCG
+TGGGGAGACAAACAGGATTAGATACCCTGTAGTCCACGCCGTAAACGATAGAATACTAGGTGTCGGGGAGCATGGCTCTT
+CGGTGCCGTCGCAAACGCAGTAAGTATT
+>288119
+TACGGTAGGGAGCGAGCGTTGTCCGGAATTACTGGGTGTAAAGGGAGCGTTAGGCGGGATCGGCAAGTCAGATGTGAAAA
+CTATGGGCTTAACCCATAAACTGCATTTGAAACTGTGGTTCTTGAGTGAAGTAGAGGTAAGCGGAATTCCTAAGTGTAGC
+GGTGAAATGCGTAGATATTAGGAGGAACATCAGTGGCGAAGGCGGCTTACTGGGCTTTAACTGAACGCTGGAGGCTACGA
+AAGCGTGGGGAGCAAACAGGATTAGATAACCCTGGTAGTCCACGCCGTAAACGGATGGATTACTAGGTGTGGGGGGACTG
+A
+>552779
+TACGGAGGATCCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGAGCGTAGGCGGATTGTTAAGTCAGTTGTGAAAGTTT
+GCGGCTCAACCGTAAAATTGCAGTTGATACTGGCAGTCTTGAGTGCAGTAGAGGTGGGCGGAATTCGTGGTGTAGCGGTG
+AAATGCTTAGATATCACGAAGAACTCCGATTGCGAAGGCAGCTCACTGGAGTGTAACTGACGCTGATGCTCGAAAGTGTG
+GGTATCAAACAGGATTAGATACCCTGGTAGTCCACGCTGTAAACGATGAATACTAGGTGTGCGGGGACTGACCCCCCTGC
+GTGCCGCAGTTAACACAATAAGTATTTCCCACCTGGGGATACGATCGCAAGGTTGAAACTCAAAGGAATTGGACGGGGGC
+CCGCACAAGCGGTGGATTA
+>542129
+TACGTAGGGGGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGTAGACGGCGAGGCAAGTCTGATGTGAAAGCCT
+GGGGCTTAACCCCGGAACTGCATTGGAAACTGCTTTGCTGGAGTGCCGGAGAGGTAAGCGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCTTACTGGACGGTAACTGACGTTGAGGCTACGAAAGCGG
+TGGGGAGACAAAC
+>554250
+TACGTATGGAGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGTGTAGGTGGCATCACAAGTCAGAAGTGAAAGCCC
+GGGGCTCAACCCCGGGATTGCTTTTGAAACTGTGGAGCTGGAGTGCAGGAGAGGCAAGTGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCTTGCTGGACTGTAACTGACACTGAGGCTCGAAAGCGTG
+GGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCCTAAACGATGTTAACTAGTTGTTGGGATGTAACAATCTCAGT
+AACGCAGCCAACGCGAGAAGCTAACCGCCTGGGAAGCACGGTCGCAAGACTAAAACTCAAAGGAATTGACGGGGACCCGC
+ACAAGCGGTGGATGATGTGGATTAATTCGATGCAACGCGAAAAACCTTACCTACCCTTGACATGTCAGAAGCTCTTGTAA
+TGAGAGCGT
+>287763
+TACGGAGGATCCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGAGCGTAGGCGGGTGCTTAAGTCAGTTGTGAAAGTTT
+GCGGCTCAACCGTAAAATTGCAGTTGATACTGGGTACCTTGAGTGCAGCATAGGTAGGCGGAATTCGTGGTGTAGCGGTG
+AAATGCTTAGATATCACGAAGAACTCCGATTGCGAAGGCAGCTTACTGGACTGTAACTGACGCTGATGCTCGAAAGTGTG
+GGTATCAAACAGGATTAGATACCCTGGTAGTCCACACAGTAAACGATGAATACTCGCTGTTGGCGATACACAGTCAGCGG
+CCAAGCGAAAGCATTAAGTATTCCGACCTGGGGATACGCCGGCAACGGTGAAACTCAAAGGAAGTTGG
+>470823
+TACGGAAGGTCCGGGCGTTATCCGGATTTATTGGGTTTAAAGGGAGCGCAGGCGGACCTTTAAGTCAGCTGTGAAATACG
+GCGGCTCAACCGTCGAACTGCAGTTGATACTGGAGGTCTTGAGTGCACACGAGGGATGCGTAGGAATTCAGTGGTGTAGG
+CGGTAGAAATGCTCAGATATCATAGAA
+>298533
+TACGGAGGATCCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGAGCGTAGGCGGACGCTTAAGTCAGTTGTGAAAGTTT
+GCGGCTCAACCGTAAAATTGCAGTTGATACTGGGTGTCTTGAGTACAGTAGAGGCAGGCGGAATTCGTGGTGTAGCGGTG
+AAATGCTTAGATATCACGAAGAACTCCGATTGCGAAGGCAGCCTGCTGGACTGTAACTGACGCTGATGCTCGAAAGTGTG
+GGGTATCAAACAGGATTAGGATACCCCTGGTAGTCCACACAGGTAAACCGATGAATACTCGCTGTTT
+>245050
+TACGTAGGGTGGCGAGCGTTATCCGGATTTATTGGGCGTAAAGGGTGCGCAGACGGTTTATTAAGTCTAAAATCAAATCT
+TGGGGGACCTCAACCCCCATTCCGTTTTAGAAACTGGTAGACTCGAGTATGGTAGAGGCAAATGGAATTCCTAGTGTAGC
+GGTGGAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGATTTGCTGGGCCATTACTGACGTTCAGGCACGAAAA
+GCGTGGGGAGCAAATAGGATTAGATACCCTAGTAGTCCACGCCGTAAACGAATGAGTACTAAAGTGTCGGGTTACCGGTG
+CTG
+>343453
+TACGTAGGTGGCGAGCGTTATCCGGATTTATTGGGCGTAAAGCGTCCGCAGCCGGTTTATTAAGTCTAGAATTAAAGCCT
+GGAGCTCAACTCCAGTTCGTTTTAGAAACTGATAGACTCGAGTGTGGTAGAGGCAAACGGAATTCCTAGTGTAGCGGTGA
+AATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGACTTACTGGACTGGTACTGACACTGAGGCACGAAAGCGTGG
+GGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGAATACTAGGTGTTGGGGCCCAAAGGGCTTCGGT
+GCCGCCGCAAACGCATTAAGTATTCCACCTGGGGAGTACGTTCGCAAGAATGAAACTCAAAGGAATTGACGGGGACCCGC
+ACAAGCGGTGGAGCATGTGGTTTAATTCGAAGCAACGCGAAGAACCTTACCAAATCTTGACATCCCGATGACCGTTCCGT
+AATGGGAACTTCTCTTTCGGAGCATCGGTGACAGGTGGTGCA
+>539347
+TACGTAGGGGGCGAGCGTTATCCGGATTTATTGGGCGTAAAGCGTGTGTAGGCGGTTTATTAAGTCCGAGATGAAAGGCT
+GAGGCTTAACCTCAGTTTGTTTCGGAAACTGGTAGACTAGAGTGCAGTAGAGGCAGTTGGAATTCATAGTGTAGCGGTAA
+AATGCGTAGATATTATGAGGAACATCAGTGGCGAAGGCGAATTGCTGGGCTGTTACTGACGCTGAGACACGAAAGCGGTG
+GGGAGCAAATAGGATTAGATACCCTAGTAGTCCACGCCGTAAACGATGAGCACTAAGGTGTACCGGGGCAACCCGGTGCT
+GAAGTTAACACATTAAGTGCTCCGCCTGAGTAGTACGGTCGCAAGACTGAAACTCAAAGGATTGACGGCACCCGCACAAG
+CGGTGGAGCATGCCG
+>291739
+TACGGTAGGGGCGAGCGTTATCCGGATTTATTGGGCGTAAAGCGTGTGTAGGCGGTTTGTTAAGTTTAAGATTAAAGCCC
+GGGGCTCAACTCCGGTAAGTCTTAAAAACTGGCAGACTTGAGTACGGTAGAGGCAAACGGAATTTCTAGTGTAGCGGTGA
+AATGCGTAGGATATTAGAAAGGAACACCAAGTGGC
+>361304
+TACGTATGGTGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGCAGGCGGTACGGCAAGTCTGATGTGAAAGCCC
+GGGGCTCAACCCCGGTACTGCATTGGAAACTGTCGGACTAGAGTGTCGGAGGGGTAAGTGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGAGATGGGGAGGAACACCAGTGGCGAAGGCGGCCTGCTGGGCTTTAACTGACGCTGAGGCACGAAAGCGTG
+GTAGCAAACAGGATTAGATACCCTGGTAGTCCACGCTGTAAACGATGATTACTAGGTGTGGGGGTCTGACCCTTCGTGCC
+GGAGTTAACACAATAAGTAATCCACCTGGGGAGTACGGCCGCAA
+>313166
+TACGTAGGGGGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGTAGACGGCGACGCAAGTCTGAAGTGAAATACC
+CGGGCTCAACCTGGGAACTGCTTTGGAAACTGTGTTGCTAGAGTGCTGGAGAGGTAAGCGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAAGAACACCAGTGGCGAAGGCGGCTTACTGGACAGTAACTGACGTTGAGGCTCGAAAGCGTG
+GGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGAATACTAGGTGTTGGTGAGCAAAGCTCATCGG
+TGCCGCCGCAAACGCAATAAGTATTCCACCTGGGGAGTACGTTCGCAAGAATGAAACTCAAAGGAATTGACGGGGACCCG
+CACAAGCGGTGGAGCATGTGGTTTAATTCGAAGCAACGCGAAGAACCTTACCAAATCTTGACAT
+>114455
+TACGTAGGGGGCGAGCGTTGTCCGGAATGATTGGGCGTAAAGGGCGCGTAGGCGGCCTGCTAAGTCTGAAGTGAAAGTCC
+TGCTTTCAAGGTGGGAAGTGCTTTGGATACTGGTGGGCTGGAGTGCAGGAGAGGAAAGCGGAATTACCGGTGTAGCGGTG
+AAATGCGTAGAGATCGGTAGGAACACCAGTGGCGAAGGCGGCTTTCTGGACTGAAACTGACGCTGAGGCGCGAAAGCGTG
+GGGAGCAAACAGGATTAGATACCCTGGTAGTCCGCGCGGTAAACGATGGATGCCCGCCGTTGGGATTTGGATTTCAGCGG
+CCAAGCGAAAGCGTTAAGCATCCCACCTGGGGAGTACGCCGGCAACGGTGAAACTCAAAGGAATTGACGGGGCCCGCACA
+AGCGGAGGAACATGTGGTTTAATTCGATGATACGCGAGGAACCTTACCCGGGCTTGAATTGCAGGAGAACGATCCAGAGA
+TGGTGACGGCCCTTCGGGCTCCGTGTGGAAGGTGCTGC
+>1941
+TACGTAGGGGGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGCAGGCGGCATGATAAGTCTGATGTGAAAACCC
+AAGGCTCAACCATGGTACTGCATTGGAAACTGTCGTACTAGAGTGTCGGAGGGGTAAGTGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCTTACTGGACGATAACTGACGCTGAGGCTCGAAAGCGTG
+GGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGAATACTAGGTGTCGGGGGAGCATTGCTCTTCGG
+TGCCGCAGCAAACGCAATAAGTATTCCACCTGGGGATACGTTCGCAAGAATGAAACTCAAAGGAATTGGACGGGGACCCG
+CACGAAGCGGTGGAGCATGTGGTTTAATTCGAAGCAACGCGAAGAACCTTACCAAGTCTTGACATCCCGATGACAAACTA
+TGTAATGTAGCCTCTCTTCGGAGTA
+>131765
+TACGTAGGTGACAAGCGTTGTCCGGATTTACTGGGTGTAAAGGGCGCGTAGGCGGACTGTCAAGTCAGTCGTGAAATACC
+GGGGCTTAACCCCGGGGCTGCGATTGAAACTGACAGCCTTGAGTATCGGAGAGGAAAGCGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCTTACTGGACTGTAACTGACGTTGAGGCTCGAAAGCGTG
+GGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGATTACTAGGTGTTGGTGGGTATGACCTATCGG
+TGCCGCAGCAAACGCAATAAGTAATCCACCTGGGGAGTACGTTCGCAAGAATGAAACTCAAAGGAATTGACGGGGACCCG
+CACAAGCGGTGGAGCATGTGGTTTAATTCGAAGCAACGCGAAGACCTTACCTGGTCTTGACATCCCTATGAATAACGGGC
+AATGCCGTTAGTACTTCGGTACA
+>522087
+TACGTATGGTGCAAGCGCTATCCGGATTTACTGGGTGTAAAGGGAGCGTAGACGGCTTAGCAAGTCTGAAGTGAAAGCCC
+GGGGCTCAACCCCGGGACTGCTTTGGAAACTGTTAAGCTGGAGTGCTGGAGAGGTAAGCGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCTTACTGGACAGTAACTGACGTTGAGGCT
+>320847
+TACGTAGGGGGGCTAGCGTTATCCGGAATTACTGGGCGTAAAGGGTGCGTAGGTGGTTTTTTAAGTCAGAAGTGAAAGGC
+TACGGCTCAACCGTAGTAAGCTTTTGAAACTAGAGAACTTGAGTGCAGGAGAGGAGAGTAGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAGGAATACCAGTAGCGAAGGCGGCTCTCTGGACTGTAACTGACACTGAGGCACGAAAGCGTG
+GGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGAGTACTAGGTGTCCGGGGGTTACCCCCTCGGT
+GCCGCAGCTAACGCATTAAGTACTCCGCCTGGGAAGTACGCTCGCAAGAGT
+>137403
+TACGTAGGGGGCGAGCGTTGTCCGGAATTACTGGGCGTAAAGGGTGCGTAGGCGGCCCTGCAGGTCAGATGTGAAATACC
+CGTGCTTAACATTGGGGCTGCATTTGAAACCGTAGGGCTTGAGTGTGGAAGAGGTAAGTGGAATTCCCGGTGTAGCGGTG
+AAATGCGTAGAGATCGGGAGGAACACCAGTGGCGAAGGCGACTTACTGGGCCACAACTGACGCTGAGGCACGAAAGCGTG
+GGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGGATACTAGGTGTAGAGGGTAACACCCCCTCTG
+TGCCGAAGCAAACGCATTAAGTATCCCGCCTGGGGAGTACGATCGCAAGATTGAACTCAAAGGAATTGACGGGGCCCGCA
+CAAGCAGCGGAGCATGTGGTTTAATTCGAAGCAACGCGAAGAACCTTACCAAGGCTTGACATCCCTTGAATGCCATAGAA
+TATGGAGTTCCTTCGGGACAAGGAGACGAGGTGGTG
+>335530
+TACGTAGGGAGCGAGCGTTGTCCGGAATTACTGGGTGTAAAGGGAGCGTAGGCGGGATCGCAAGTCAGATGTGAAAACTA
+TGGGCTTAACCCATAAACTGCATTTGAAACTGTGGTTCTTGAGTGAAGTAGAGGTAAGCGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAGGAACATCAGTGGCGAAGGCGGCTTACTGGGCTTTAACGTGACGCTGAGGCTCGAAGGCGT
+GGGGAGCAAACAGGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGATTACTAGGTGTGGGGGGACTGACCCCTTC
+CGTGCCGCAGCAAACGCAATAAGTAATCCAACCTGGGGAGTACGACCGCAAGGTTGAAACGTCAAAGGATTGACGGGGGC
+CCGCACAAAGCAGTGGGAGTA
+>233805
+TACGTAGGTGGCAAGCGCTGTCCGGATTTACTGGGTGTAAAGGGCGTGTAGCCGGGAGGGCAAGTCAGATGTGAAATCCA
+CGGGCTCAACTCGTGAACTGCATTTGACACTACTCTTCCTGAGTATCGGAGAGGCAATCGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGATTGCTGGACGACAACTGCGGTAAAAAAGGGCGCGAAAG
+CGTGGGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCTGTAAACGATGAATACTAGGTGTGCGGGGACTGACCCCC
+TGCGTGCCGCAGCTAACGCAATAAGTATTCCACCTGGGGAGTACGATCGCAAGGTTGAAACTCAAAGGATTGACGGGGCC
+CGCACAAGCGGTGGATTATGTGGTTTAATTCGAAGCAACGCGAAGAACCTTACCAGGCTTGACATCCTACTAACGAGATA
+GAGATATGTTAGGT
+>272261
+TACGTAGGGAGCAAGCGTTATCCGGATTTATTGGGTGTAAAGGGTGCGTAGACGGTAATGCAAGTTAGTTGTGAAATCCC
+TCGGCTTAACTGAGGAACTGCAACTAAAACTACATTTCTTGAGTATTGGAGGGGAAAGTGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCTTACTGGACGATGACTGACGCTGAGGCTCGAAAGCGTG
+GGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATAGAATACTAGGTGTCGGGGAGCACAGCTCTTCG
+GTGCTGCAGCAACGCAGTAAGTATTCCACCTGGGGAGT
+>257151
+TACGTAGGTGGCAAGCGTTGTCCGGATTTACTGGGTGTAAAGGGCGTGTAGGCGGAGAAGCAAGTCAGAAGTGAAATCCA
+TGGGCTTAACCCATGAACTGCTTTTGAAACTGTTTCCCTTGAGTATCGGAGAGGCAGGCGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCCTGCTGGACGACAACTGACGCTGAGGCG
+>548184
+TACGTAGGGTGCAAGCGTTAATCGGAATTACTGGGCGTAAAGACGTGCGCAGGCGGTTCTGTAAGACAGATGTGAAATCC
+CCGGGCTTAACCTGGGGAGTTGCATTTGTGACTGCAGGACTAGAGTTCATCAGAGGGGGGTGGGAATTCCAAGTGTAGCA
+GTGAAATGCGTAGATATTTTGGAAGAAACACCAATGGCGAAGGCAGCCCCCTGGGATGCGACTGACGCTCATGCACGAAA
+GCGTGGGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCCTAAACGATGTCTACTGGTT
+>78203
+AACGTAGGTCACAAGCGTTGTCCGGAATTACTGGGTGTAAAGGGAGCGCAGGCGGGAAGACAAGTTGGAAGTGAAATCCA
+TGGGCTCAACCCATGAACTGCTTTCAAAACTGTTTTTCTTGAGTAGTGCAGAGGTAGGCGGAATTCCCGGTGTAGCGGTG
+GAATGCGTAGATATCGGGAGGAACACCAGTGGCGAAGGCGGCCTACTGGGCACCAACTGACGCTGAGGCTCGAAAGCGTG
+GGGAGCAAACAGGATTAGATACCCAGTGGTACGTCCACGACCGTAAAGATAGAATACGTAGGT
+>162326
+TGCGGAGGATCCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGAGCGTAGGCGGGTTGTTAAGTCAGTTGTGAAAGTTT
+GCGGCTCAACCGTAAAATTGCAGTTGATACTGGCGACCTTGAGTGCAGTTGAGGCAGGCGGAATTCGTGGTGTAGCGGTG
+AAATGCTTAGATATCACGAAGAACTCCGATTGCGAAGGCAGCCTGCTAAGCTGCAACTGACATTGAGGCTCGAAAGTGTG
+GGTATCAAACAGGATTAGATACCCTGGTAGTCCACACGGTAAACGATGAATACTCGCTGTTTGCGATATACGGCAAGC
+>183122
+GATAAGCAGCAGGATTTTCCTTATATCCGGCCAGGTATTCGCGCCACTTACTCACGGATCCGTTTCCGGCATAGTAAGTT
+TCAGAGAAACCGGCAGCCTGGTAAGCTCTCAGGTATTCGTCAAGGGAAGCTTGTTTGGGACTGTTGATAGAAGTCTGGAA
+GCCAAAATTGGCGTTGTAGTTCAAGTTGAATTTTTCGCCTTTCTTGGCTTTTTTGGTGGTAATCAGGATAACACCGGCAG
+CAGCACGTGCACCATAGATGGCGGACGATGCCGCATCCTTCAGTACGGTGACGCTTTCAATATCTTCCGGATTCAACAAG
+TCTATATCACCTTCAACGTTGTCGATCAGCACCAGCGGATTCATACCGTTGATGGATACTGTACCACGTATGTTGAAGCT
+TTTTGCTTCACCCGGTGATGCACCACCACTTACCATCAGACCGGGCATAGACCCTGCAGGCATTCTTGACTATTGGTTAC
+AGGTACGC
+>556145
+TACGTAGGGAGCGAGCGTTATCCGGATTTATTGGGTGTAAAGGGTGCGTAGACGGGAAGTCAAGTTAGTTGTGAAATCCC
+TCGGCTTAACTGAGGAACTGCAACTAAAACTGATTTTCTTGAGTACTGGAGAGGAAAGTGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAGGAACACCGGTGGCGAAGGCGACTTTCTGGACAGAAACTGACGTTGAGGCACGAAAGTGTG
+GGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCTGTAAACGATGATTACTAGGTGTGGGGGGACTGACCCCTTCCG
+TGCCGCAGTTAACACAATAAGTAATCCACCTGGGAGTACGGCCGCAAGGTTGAAACTCAAAGGAATTGACGGGGGCCCGC
+ACAAGCAGTGGAGTATGTGGTTTAATTCGAAGCAACGCGAAGAACCTTACCAGGTCTTGACATCGAGTGCATAACTAAGA
+GATTAGTGAAATCCCTTCGGGGACACTTAGACAGGTGGTGCATGGTTGTCGTCAG
+>82548
+TACGTAGGGGGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGTAGACGGAATGGCAAGTCTGATGTGAAAGGCC
+GGGGCTCAACCCCGGGACTGCATTGGAAACTGTCAATCTAGAGTACCGGAGGGGTAAGTGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAGGAACACCGGTGGCGAAGGCGGCTTACTGGACGGTAACTGACGTTGAGGCTCGAAAGCGTG
+GGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGAATACTAGGTGTTGGGGAGCAAAGCTCTTCGG
+TGCCGCAGCAAACGCAATTAAGTATTCCACCTGGGGATACGTTCGCAAGAATGAAACTCAAAGGAATTGACGGGACCGCA
+CAAGCGGTGGAGCATGTGGTTAATTCGAAGCAACGCGAAGAACCTTACCAAGTCTTGACATCGATCTGACCGGACCGTAA
+TGGG
+>113510
+TACGTAGGATGCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGAGCGCAGACGGGAGATTAAGTCAGTTGTGAAAGTTT
+GCGGCTCAACCGTAAAATTGCAGTTGATACTGGTTTCCTTGAGTGCAGTTGAGGCAGGCGGAATTCGTGGTGTAGCGGTG
+AAATGCTTAGATATCACGAAGAACCCCGATTGCGAAGGCAGCTTGCTAAACTGTAACTGACGTTCATGCTCGAAAGTGTG
+GGTATCAAACAGGATTAGATACCCTGGTAGTCCACACGGTAAACGATGGATACTCGCTGTTGGCGATATACTGTCAGCGG
+CCAAGCGAAAGCATTAAGTATCCCACCGTGGGGAGTACCGCCGGCAACGGTGAAACTCAAAGGAATTGACGGGGGCCCGC
+ACAAGCGGAGGAACATGTGGTTTAATTCGATGATACGCGAGGAACCTTACCCGGGCTTAAATTGCAGAC
+>276351
+TACGGAGGATGCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGTAGGTGGATAGGTAAGTCAGCGGTGAAAGTTT
+GTGGCTCAACCATAAAATTGCCGTTGAAACTGTTTATCTTGAGTATGTTTGAGGTATGCGGAATGCGTGGTGTAGCGGTG
+AAATGCATAGATATCACGCAGAACTCCGATTGCGAAGGCAGCATACTAAACCATCACTGACACTGAAGCACGAAAGCGTG
+GGTATCAAACAGGATTAGATACCTGGTAGTCCACGCAGTAAACGATGATTACTAGGAGTTTGCGATATACAGTAAGCTCT
+ACAGCGAAAGCGTTAAGTAATCCACCTGGGAGTACGCCGGCACGGTGAACTCAAAGGAATTGACGGGGCCGGCACAAGCG
+GAGGAACATGTGGTTTAATTCGATGATACGCGAGGAACCGTTACCCGGGTTT
+>537429
+AGATCTACGGTGAAGAGATAGCCGCCGCGGTTTCGTCGCCGAACAGCACGCTTGCCTGTGAATATATAACCGCATTGTCA
+AAATATGCACCTAACGCTGAAATTATCCCTGTAAAGCGAGTCGGCGCGGGGCACGATCTTCCACCTCAGGACGGATTTGC
+GAGCGGTTCATATCTTCGCGGGGCTATCCAAAACGGCGGTAATGCACTAGAATATCTGCCGAACGGCGTAAACATCACAC
+CGCCAAAGCAGCCGCAAGCGGCTGAAAATGCCATATATTACCATCTACTGACCGCCACACGCGAACAGTTTTTGCGCCTG
+>342865
+TACGTAGGGAGCAAGCGTTATCCGGATTTATTGGGTGTAAAGGGTACGTAGGCGGCCTCATAAGTCTGTGGTTTAAGCCC
+GAAGCTTAACTTCGGTTCGCCACAGAAACTGTTTGGCTTGAGTATGGTAGAGGCAAGTGGAATTTCTAGTGTAGCGGTTA
+AATGCGTAGATATTAGAAGGAACACCAGTGGCGAAGGCGACTTGCTGGGCCATTACTGACGCTGAGGTACGAAAGCGTGG
+GGAGCAAATAGGATTAGATACCCTAGTAGTCCACGCCGTAAACGATGGATACTAAGTGTCGGGCAACCGGTGCTGAAGTA
+AACACATTAAGTATCCCACCTGAGTAGTACGGTCGCGAGGCTGAAACTCAAAGGAATTGACGGGCACCCGCACAAGCGGT
+GGAGCATGCTGTTTAATTCGATGCTACGCGAAGAACCTTACCTGGGCTTGACATCCCTTTGACCGGACCTAGAAATAGGA
+CC
+>550164
+TACGTAGGTGGCAAGCGTTGTCCGGATTTACTGGGTGTAAAGGGCGCGTAGGCGGGATGGCAAGTCAGATGTGAAATCCA
+TGGGCTCAACCCATGAACTTGCATTTGAAACTGTCGTTCTTGAGTATCGGAGAGGCAAGCGGAATTCCTAGTGTAGCGGT
+GAAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCTTGCTGGACGACAACTGACGCTGAGGCGCGAAAGCGT
+GGGGAGCAAACAGGATTAGATAACCCTGGTAGTCCACGCCGTAAACGATGAATACTAGGTGTGGGGGGACTGACCCCCTC
+CGTGCCGCAGTTAACACCAA
+>269355
+TACGTAGGTGGCGAGCGTTATCCGGAATCATTGGGCGTAAAGGAGGGAGCAGGCGGCCGCAAGGGTACTGTGGTGAAAGG
+ACCGAAGCCTAAACTTCGGTTAAGCCATGGAAATCGGGCGGCTGGAGTGCGGAAGAGGATCGTGGAATTCCATGTGTAGC
+GGTGAAATGCGTAGATATATGGAGGAACACCAGTGGCGAAGGCGACGGTCTGGGCCGCAACTGACGCTCA
+>588867
+TACGTATGGTGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGCAGGCGGAAGGCTAAGTCTGGTGTGAAAGCCC
+GGGGCTCAACCCCGGTACTGCATTGGAAACTGGTCATCTAGAGTGTCGGAGGGGTAAGTGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCTTACTGGACGATAACTGACGCTGAGGCTCGAAAGCGTG
+GGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGAATACTAGGTGTCGGGAAGCACAGCTTTTCGG
+TGCCGCCGCAAACGCATTAAGTATTCCACCTGGGGAGTACGTTCGCAAGAATGAAACTCAAAGGAATTGACGGGGACCCG
+CACAAGCGGTGGAGCATGTGGTTTAATTCGAAGCAACGCGAAGAACCTTACCAAGTCTTGACATCCTTCTGATCGGACAG
+TAATGTGTCCTTTCTTCGGGACAGGAAGTGACAGGTGGTG
+>153026
+TACGGAAGGTCCAGGCGTTATCCGGATTTATTGGGTTTAAAGGGAGCGTAGGCCGTTTGGTAAGCGTGTTGTGAAATGGT
+CGGGGCTCAACCTGGGCATTGCAGCGCGAACTGCCAGACTTGAGTGCGACAAGAAAGTAGGCGGAATTCGTCGTGTAGCG
+GTAGAAATGCTTAGATAT
+>2538
+TACGTAGGTGGCAAGCGTTGTCCGGATTTACTGGGTGTAAAGGGCGAGTAGGCGGGATATCAAGTCAGGTGTGAAATCCA
+TGGGCTCAACCCATGAACTGCACTTGAAACTGATATTCTTGAGTGATGGAGAGGCAGGCGGAATTCCCGGTGTAGCGGTG
+GAATGCGTAGATATCGGGAGGAACACCAGTGGCGAAGGCGGCCTGCTGGACATTAACTGACGCTGAGGAGCGAAAGCGTG
+GGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGGATACTAGGTGTGGGAGGTACTGACCCCTT
+>262076
+TACGTAGGTGGCGAGCGTTATCCGGATTTACTGGGTGTAAAGGGTGTGTAGGCGGGAAGGCAAGTCAGATGTGAAAACTA
+TGGGCTCAACCCATAGCCTGCATTTGAAACTGTTTTTCTTGAGAGTCGGAGAGGTAAGTGGAATTCCCGGTGTAGCGGTG
+AAATGCGTAGATATCGGGAGGAACATCTGTGGCGAAGGCGACTTACTGGACGATTACTGACGCTGAGACACGAAAGCGTG
+GGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGAACACTAGGTGTGGGTCGGCGAGACTCCGTCG
+CGCCGCCGACGTACACTAAGTAGTTCGTACCT
+>55727
+TACGGAAGGTCCGGGCGTTATCCGGATTTATTGGGTTTAAAGGGAGCGTAGGCCGTCTGTTAAGCGTGTTGTGAAATGTC
+GTGGCTCAACCGGGGCACTGCAGCGCGAACTGGCAGACTTGAGTGCACGGTAGGAAGGCGGAATTCGTCGTGTAGCGGTG
+AAATGCTTAGATATGACGAAGAACTCCGATTGCGAAGGCAGCTTTCCGTAGTGTAACTGACGCTGAAGCTCGAAAGCGTG
+GGTATCAAACAGGATTAGATACCCTGGTAGTCCACACCGTAAACGATGATTACTAGGCGTTGGAGGATTGACCCCTTCAG
+TGCCGCAGTTAACACAATAAGTAATCCACCTGGGGAGTACGACCGCAAGGTTGAAACTCAAAGGATTGGACGGGGGCCCG
+CACAAGCAGTGGAGTATGTGGTTTAATTCGACGCAACGCGAAGAACCTTACCAAGTCTTGACATCCCTTGACGATGCT
+>306684
+TTCCAGCTCCAATAGCGTATATTAACGTTGTTGCAGTTAAAAAGCTCGTAGTTGAAATGAAGGGTAGTTGTGTAATGAAT
+ACATTCGCGTTATTTTGTTATTCTACTACCCTCCTTCTAAATTCGATATATGAGTTATTTAATTTCTTGTATATTGGTTT
+TAGTACTTTTACTGTGAAGTAAAATTAGAGTGTTACAAAGCGA
+>556276
+TACGTAGGTGGCAAGCGTTATCCGGAATTATTGGGCGTAAAGAGAGAGCAGGCGGTTATTCAAGTCTGAAGTGAAAAGCA
+GTGGCTTAACCATTGTAGGCTTTGGAAACTAGATAGCTAGAGTGCAAGAGAGGTTAGCGGAACTCCATGTGTAGCGGTGA
+AATGCGTAGATATATGGAAGAACACCAGTGGCGAAGGCGGCTAACTAGCTT
+>578831
+TACGTAGGTGGCGAGCGTTATCCGGAATCATTGGGCGTAAAGAGGGAGCAGGCGGCAACTAGGGTCTGCGGTAAAAAGAC
+CGAAAGCTAAAACTTCGGTAAGCCGTGGAAACCGAGGAGCTAGAGTGCAGTAGAGGATCGTGGAATTCCATGTGTAGCGG
+TGAAATGCGTAGATATATGGAGGAACACCAGTGGCGAAGGCGACGATCTGGGCTGCAACTGACGCTCA
+>268617
+TACGTATGGTGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGCAGGCGGTGCGGCAAGTCTGATGTGAAAGCCC
+GGGGCTCAACCCCGGTACTGCATTGGAAACTGTCGTACTAGAGTGTCGGAGGGGTAAGTGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAGGAACACGGTGGCGAAGGCGGCTTACTGGACTGTAACTGACGTTGAGGCTCGAAAGCGTGG
+GGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGATTACTAGGTGTTGGGGATTCATAAATCCCCGG
+TGCCGTCGGCAAACGCAATAAGTAATCCACCTGGGGAGTACGTTCGCAAGAATGAAACTCAAAGGAATTGGACGGGGACC
+CGCACAAG
+>236430
+TACGTAGGGAGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGCGCGCAGGCGGGCCGGTAAGTTGGAAGTGAAATCTA
+TGGGCTTAACCCATAAACTGCTTTCAAAACTGCTGGTCTTGAGTGATGGAGAGGCAGGTGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCTTACTGGACGATAACTGACGCTGAGGCTCGAAAGCGTG
+GGGAGACAAACAGGATTAGACT
+>591182
+GGAGTGCATCTGCACCGTTATTGCATCCGGCCTGCCTGTGATGAAGGCCGAGGATACCGAATGCCCCATCGCCTCCCTGC
+GTATCATCTACACCCCGGATACCGAGTACAACGAGTACACCGCAAACTGGATCATCTCCCCGGATACTGCGGTCCCTGCT
+TTGGGCTGACCCTTACACCATATTTTGTTGATAAACGGAGGAACGTATCATGTCTTACACTTTTTCCCGGCGTGATTTTC
+TGAAGTATTCTGCCATGACCGCTGTGGCTGTGGCCGGTGCCGGCCTGCTGACGGGCTGTGAGATCCAGGACCCCAACAAC
+CCTGTTATCAAAAAGCTGGGTTACGGCACGACCCTG
+>260458
+TACGTATGGAGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGTGTAGGTGGTATCACAAGTTCAGAAAGTGAAAGC
+CCGGGGCTACAACCCCGGGACTGCTTTTGAAACTGTGGAACTGGAGTGCAGGAGAGGTAAGTGGAATTCCTAGTGTAGCG
+GTGAAATGCGTAG
+>86556
+TACGTAGGTGGCGAGCGTTATCCGGAATCATTGGGCGTAAAGAGGGAGCAGGCGGCCGCAAGGGTCTGTGGTGAAAGACC
+GAAGCTAAACTTCGGTAAGCCATGGAAACCGGGCGGCTAGAGTGCGGAAGAGGATCGTGGAATTCCATGTGTAGCGGTGA
+AATGCGTAGATATATGGGGGAACACCAGTGGCGAAGGCGACGGTCTGGGCCGCAACCGACGCTCATTCCCGAAAGCGTGG
+GGAGCAAATAGGATTAGATACCCTAGTAGTCCACGCCGTAAACCGATGGTCACTAAGTGTCGGGGGTCAAACCCCCGGTG
+CTGCAGTCAACGCAATAAGTGACCCGCCTGAGTAGTACGTTCGCAAGAATGAAACTCAAAGGAATTGACGGGGGCCCGCA
+CAAGCGGTGGAGCATGTGGTTTAATTCGAGCAGCGAAGACCTACAGGTCTTGACATCGATCTAAAAAGGGATGGGAGACA
+TCCTCATAGCTATAGAGAAGACAGGTGGTGC
+>259956
+TACGGAGGGTGCAAGCGTTAATCGGAATAACTGGGCGTAAAGGGCATGCAGGCGGTTCATCAAGTAGGATGTGAAATCCC
+CGGGCTCAACCTGGGAACAGCATACTAAACTGGTGGACTAGAGTATTGCAGGGGGAGACGGAATTCCAGGTGTAGCGGTG
+GAATGCGTAGATATCTGGAAGAACACCAAAGGCGAAGGCACACAGGGGATAGG
+>112749
+TACGTAGGGGACAAGCGTTATCCGGATTTATTGGGCGTAAAGCGTGCGTAGGCGGCATATTAAGTTTAAGATAAAAGCCC
+GGGGCTTAACTCCGGTTCGTCTTAAAAACTGATAAGCTTGAGTGTGGTAGAGGTAAATGGAATTTCTAGTGTAGCGGTGA
+AATGCGTAGATATTAGAAGGAACACCAGTGGCGAAGGCGATTTACTGGGCCACAACTGACGCTGAGGCACGAAAGCGTGG
+GTAGCAAATAGGATTAGATACCCTAGTAGTCCACGCCGTAAACGATGAGTACTAAGTGTTGGGGAAACTCAGTGCTGAAG
+CTAACGTATTAAGTACTCCGCCTGAGTAGTACGGTCGCAAGGCTGAAACTCAAAGGAATTGACGGGCACCCGCACAAGCG
+GTGGAGCATGCTGTTTAATTCGAAGATACGCGAAGAACCTTACCTAGACTTGACATCCCTGGCAAAGATATAGAAATATA
+TCGGAGGTTATCCAGGTGACAGGTGGTGCATGGTTGTCGTCAG
+>304950
+TACGTAGGTGGCAAGCGTTGTCCGGATTTACTGGGTGTAAAGGGCGTGTAGCCGGGTGTGTAAGTCAGATGTGAAATCTG
+GAGGCTCAACCTCCAAACTGCATTTGAAACTGCGCATCTTGAGTATCGGAGAGGTAATCGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAAGAACACCAGTGGCGAAGGCGGATTACTGGACGACAACTGACGGTGAGGCGCGAAAGCGTG
+GGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCTGTAAACGATCAATACTAGGTGTGCGGGGACTGACCCCTGCGT
+GCCGCAGTTAACACAATAAGTATTGCACCTGGGGAGTACGATCGCAAGGTTGAAACTCAAAGGAATTGACGGGGGCCCGC
+GACAAGCGGTGGATTATGTGGTTTAATTCGAAGCAACGCGAAGAACCTTACCAGGACTTGACATCCTACTAACGAAGCAG
+AGATGCATTAGGTG
+>3581
+TACGTAGGGGGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGTAGACGGCATCACAAGTCAGAAGTGAAAATCC
+GGGGCTCAACCCCGGAACTGCTTTTGAAACTGTGGAGCTGGAGTGCAGGAGAGGTAAGCGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCTTACTGGACTGTAACTGACGTTGAGGCTCGAAAGCGTG
+GGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGAATACTAGGTGTCGGGGCCGTAGAGGCTTCGG
+TGCCGCAGCCAACGCAATAAGTATTCCACCTGGGGAGTACGTTCGCAAGAATGAAACTCAAAGGAATTGACGGGACCCGC
+ACAAGCGGTGGAGCATGTAGGTTTAATTCGAAGCAACGCGAAGAACCTTACCTGGTCTTGACATCCTTCTGACCGACTCT
+TAATCGTAGTTTTTCCTTCGGGACAGGAGTGAC
+>329327
+TACGTAGGGGGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGTAGACGGATGGACAAGTCTGATGTGAAAGGCT
+GGGGCTCAACCCCGGGACTGCATTGGAAACTGCCCGTCTTGAGTGCCGGAGAGGTAAGCGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCTTACTGGACGGTAACTGACGTTGAGGCTCGAAAGCGTG
+GGGAGTAAACAGGATTAGATACCCTGGTAGTCCACGCGGTAAACGATGAATGCTAGGTGTCGGGGAGCAAAGCTCTTCGG
+TGCCGCCGCAAACGCATTAAGCATTCCACCTGGGGGAGTACGTTCGCAAGAATGAAACTCAAAGGAATTGGACGGGGACC
+CGCACAAGCGGTGGAGCATGTGGTTTAATCGAAGCAACGCGAAGACCTTACAAGTCTTGACA
+>247518
+TACGGAGGATCCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGAGACGTAAGAATAGGATGTTTAAGTCAGTTGTGAAA
+GTTTGCGGCTCAACCGTAAAATTGCAGTTGATAACTGGATATCTTGAGTGCAGTTGAGGCAGGCGGAATTCGTGGTGTAG
+CGGTGAAATGCTTAGATATCACGAAGAACTCCGATTGCGAAGGCAGCCTGCTAAGCTGCAACTGACATTGAGC
+>591098
+ATACGGAGGATCCGAGCGTTATCCGGATTTAGTTGGGTTTAAAAGGGAGCGTAGGTGGATTGTTAAGTCAGTTGTGAAAA
+GTTTGCGGCTCAACCCGTAAAATTGCAGTTGAAACTAGGCTAGTCTTGAGTACAGTAGAGGTGGGCGGAATTCGTGGTGT
+AGCGGTGAAATGCTTAGATATCACGAAGAACTCCGATTGCGAAGGCAGCTCACTAGACTGCAACTGACACT
+>535000
+TACGTAGGGGGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGTGCGTAGGTGGCAAGGCAAGTCAGATGTGAAAGCCC
+GGGGCTCAACCCCGGTACTGCATTTGAAACTGTCTAGCTAGAGTGCAGGAGAGGTAAGCGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCTTACTGGACTGTAACTGACACTGAGGCACGAAAGCGTG
+GGGAGCAAACAGGATTAGATACCCTGGTAGTCCACACAGTAAACGATGAATACTCGCTGTTTGCGATATACAGTAAGCGG
+CCAAGGCGAAAGCATTAAGTATTCCACCTGGGGATGCGCCGGCAACGGTGAAACTCAAAGGAATTGGACGGGGGCCCGCA
+CAAGCGGAGGAACAGTGTGGTTTAATTCGATGATACGGCGAGGAA
+>73076
+TACGTATGGTGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGCAGGCGGTGCGGCAAGTCTGATGTGAAAGCCC
+GGGGCTCAACCCCGGTACTGCATTGGAAACTGTCGTACTAGAGTGTCGGAGGGTAAGTGGAATTCCTAGTGTAGCGGTGA
+AATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCTTACTGGACGATAACTGACGCCGAGGCTCGAAAGCGTGG
+GGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGAATACTAGGTGTCGGGGAGCGATTGCTTCTTCG
+GTGCCGCAGCAAACGCAATAAGTATTCCACCTGGGGAGTACGTTCGCAAGAATAGAAACTCAAAGGATGACGGGACCGCA
+CAAGCGGTGAGC
+>538794
+TACGGAGGATGCAAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGTAGGCGGCACGCCAAGTCAGCGGTGAAATTTC
+CGGGCTCAACCCGGACTGTGCCGTTGAAACTGGCGAGCTAGAGTACACAAGAGGCAGGCGGAATGCGTGGTGTAGCGGTG
+AAATGCATAGATATCACGCAGAACCCCGATTGCGAAGGCAGCCTGCTAGGGTGAAACAGACGCTGAGGCACGAAAGCGTG
+GGTATCGAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGAATACTAGGTGTGGGGGACTGACCCTTCCGTG
+CCGCAGTTAACACAATAAGTATTCCACCTGGGGAGTACGACCGCAAGGTT
+>546975
+TACGTAGGTTGCAAGCGTTGTCCGGATTTACTGGGTGTAAAGGGCGTGTAGGCGGAGATGCAAGTTGGGAGTGAAATCCA
+TGGGCTCAACCCATGAACTGCTCTCAAAACTGTATCCCTTGAGTATCGGAGAGGCAAGCGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCTTGCTGGACGACAACTGACGCTGAGACACGAAAGCGTG
+GGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGAATGCTAGGTGTAGGGTCGGAAACGGGTCTGT
+GCCGCAGCTAACGCGATAAGCATTCCACCTGGGGA
+>341275
+TACGGAGGATCCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGAGCGTAGATGGATATTTAAGTCGGTTGTGAAAGTTT
+GCGGCTCAACCGTAAAATTGCAGTTGATACTAGGATTATCTTGAGTGCAGTTGAGGCGGGCGGAATTCGTGGTGTAGCGG
+TGAAATGCTTAGATATCACGAAGAACTCCGATTGCGAAGGCAGCCTGCTAAGCTGCAACTGACATTGAGGCTACGAAAGT
+GTGGGTATCAAAC
+>512598
+TACGTAGGGGGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGTAGACGGCATCACAAGTCAGAAGTGAAAATCC
+GGGGCTCAACCCCGGAACTGCTTTTGAAACTGTGGAGCTGGAGTGCAGGAGAGGTAAGCGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCTTACTGGACTGTAACTGACGTTGAGGCTCGAAAGCGTG
+GGGAGCAAACAGGATTAGATACCACGTGGTACGTCC
+>573607
+AACGTAGGTCACAAGCGTTGTCCGGAATTACTGGGTGTAAAGGGAGCGCAGGCGGGAAGACAAGTTGGAAGTGAAATACC
+AGTGGGCTCCAAACCCATGAACTTGCTTTCAAAACTGTTTTTCTTGAGTAGTGCAGAGGTAGGCGGAATTCCCGGTGTAG
+CGGTGGAATAGCGTAGATATCGGGAGGAACACCAAGTGGCGAAGGCGGCCTACTGGGCACCAACTGACGCTGAGGCTCGA
+AAGTGT
+>113626
+ATACGGAGGATGCGAGCGTTATCCGGATTTATTAGGGTTTAAAGGGAGCGCAGACGGGTCGTTAAGTCAGCTGTGAAAGT
+TTGGGGCTCAACCTTAAAATTGCAGTTGATAACTGGTCGTCCTTGAGTGCGGTTGAGGTGTGCGGAATTCGTGGTGTAGC
+GGTGAAATGCTTAGATATCACGAAGAACTCCGATTGCGAAGGCAGCACACTAATCCGTAACTGACGTTC
+>223948
+TACGTAGGTTGCAAGCGTTGTCCGGATTTACTGGGTGTAAAGGGCGTGTAGGCGGAGATGCAAGTTAGGAGTGAAATCTA
+TGGGCTCAACCCATAAACTGCTTCTAAAACTGTATCCCTTGAGTATCGGAGAGGCAAGCGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCTTGCTGGACGACAACTGACGCTGGAGGCGACGAAAGCG
+GTGGGGAGACAAACAGGATTAGACTACCCTGGTAGTCCACGCTAGTAAACGATAGAA
+>254706
+TACGGAGGATCCGAGTGTTATCCGGATTTATTGGGTTTAAAGGGTGCGTAGGCTGTTTTTTAAGTTAGAGGTGAAAGCTC
+GACGCTCAACGTCGAAATTGCCTCTGATACTGAGAGACTAGAGTGTAGTTGCGGAAGGCGGAATGTGTGGTGTAGCGGTG
+AAATGCTTAGATATCACACAGAACACCGATTGCGAAGGCAGCTTTCCAAGCTATTACTGACGCTGAGGCACGAAAGCGTG
+GGGAGCGAACAGGATTAGATACCCTGGTAGTCCACGCAGTAAACGATGATAACTCGTTGCCGGCGATACACAGTCGGTGA
+CTTAGCGAAAGCGTTAAGTTATCCACCTGGGGGAGTACGTTCGCAAGAATGAAACTCAAAGGATTGACGGGGGCCCGCAC
+GAAGCGGAGGAACATGTGGTTTAATCGATGATACGCGAGG
+>258006
+TACGTATGGTGCAAGCGTTACCCGGATTTACTGGGTGTAAAGGGTGCGTAGGTGGCAAGGCAAGTCTGAAGTGAAAATCC
+GGGGCTCAACCCCGGAACTGCTTTGGAAACTGTTTAGCTAGAGTACAGGAGAGGTAAGTGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGACTTACTGGACTGCTACTGACACTGAGGCACGAAAGCGTG
+GGGAGCAAACAGGATTAGATACCCTGGTAGTCCACACGGTAAACGATGAATACTCGCTGTTTGCGATATACGGCAAGCGG
+CCAAGCGAAAGCGTTAAGTATTCCACCTGGGGAGTACGCCGGCAACGGTGAAACTCAAAGGATTGACGGGGCCCGCACAA
+GCGGAGGAACATGTGGTTTAATTCGATGATACGCGAGGAACCTTACCCGGGCTTAAATTGCACTCGAATGATCCGGAAAC
+GGTTCAGCTAGCAATAGCGAGTGTGAAGGTGCTGCA
+>111986
+TACGTATGGTGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGCAGGCGGTACGGCAAGTCTGATGTGAAATCCC
+GGGGCTCAACCCCGGTACTGCATTGGAAACTGTCGGACTAGAGTGTCGGAGGGGTAAGTGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCTTACTGGACGATTACTGACGCTGAGGCTCGAAAGCGTG
+GGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGAATACTAGGTGTCGGGGAGCATTGCTCTTCGG
+TGCCGCAGCAAACGCAATAAGTATTCCACCTGGGGATACGGTTCGCAAGATGAAACTCAAAGGATTGGACGGGGACCCGC
+ACAAGCGGTGGAGCATGTGGTTTAATCGAAGCAACGCGAAGAACCTTACCAAGTCTTGACATCCCATGAACAAGTATGTA
+ATGTACTTTCTCTTCGGA
+>113959
+TACGTAGGTGGCGAGCGTTATCCGGATTTACTGGGTGTAAAGGGCGCGTAGGCGGGAATGCAAGTCAGATGTGAAATCCA
+GGGGCTTAACCCTTGAACTGCATTTGAAACTGTATTTCTTGAGTGTCGGAGAGGTTGACGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGTCAACTGGACGATAACTGACGCTGAGGCTCGAAAGCGTG
+GGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGAATACTAGGTGTCGGGGAGCATTGCTCTTCGGT
+GCCGCAGCAAACGCAAATAAGTATTCCACCTGGGGGAGTACGTTCGCAAGAATGAAACTC
+>280233
+TACGTAGGGGGCGAGCGTTGTCCGGATTTACTGGGCGTAAAGAGTGCGTAGGTGGCTTTGCAAGTCAGATGTGAAATACC
+GGGGCTTAACCCCGGGGCTGCATCTGAAACTGTAGAGCTAGAGTACAGGAGGGGAAGGCGGAATTCCTAGTGGAGCGGCG
+AAATGCGTAGAGATTAGGAGGAACACCGGTGGCGAAGGCGGCTTTCTGGACTGCAACTGACACTGAGGCACGAAAGCGTG
+GGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGGATACTAGGTGTGGGGTCGAATAGATTCCGTG
+CCGCAGCAAACGCAATAAGTATCCCGCCTGGGGAGTACGGCCGCAAGGTTGAAACTCAAAGGAATTGACGGGGGCCGCAC
+AAGCAGCGGAGCATGTGGTTTAATTCGAAGCAACGCGAAAAACCTTACCAGAACTTGACATCCCGCTGAATGTATTGGAA
+GCAATGCAGGCTTACGCAAGTAAGACAGCGGTGACA
+>553285
+TACGTAGGTTGCGAGCGTTGTCCGGATTTACTGGGTGTAAAGGGCGTGTAGGCGGAGATGCAAGTTGGGAGTGAAATCCA
+TGGGCTCAACCCATGAACTGCTTCCAAAACTGTATCCCTTGAGTATCGGAGAGGCAAGCGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCTTGCTGGACGACAACTGACGCTGAGGCGCGAAAGCGTG
+GGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCTGTAAACGATGAATACTAGGTGTGGGAGGACTGACCCCTTCCG
+TGCCGCAGTTAACACAATAAGTATTCCACCTGGGGATACGACCGCAAGGTTGAAACTCAAAGGAATTGACGGGGGCCCGC
+ACAAGCAGTGGAGTATGTGGTTTAATTCGAAGCAACGCGAAGAACCTTACCAGGTCTTGGACATCCGATGCATAGCACAG
+AGATGTGTGAAATCCTTCGGGACTAG
+>238997
+CACGTATGGAGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGTGTAGGTGGCCAGGCAAGTCAGAAGTGAAAGCCC
+GGGGCTCAACCCCGGGACTGCTTTTGAAACTGCAGGGCTAGAGTGCAGGAGGGGCAAGTGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCTTGCTGGACTGTAACTGACACTGAGGCTCGAAAGCGTG
+GGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGAATACTAGGTGTCGGGGCACATAAGTGCTTCG
+GTGCCGCAGCAAACGCAATAAGTATTCCACCTGGGGAGTACGTTCGCAAGAATGAAACTCAAAGGATTGACGGGGACCCG
+CACAAGCGGTGGAGCATGTGGTTTAATCGAAGCAACGCGAAGAACCTTACCAAGTCTTAGACATCCCACGTGACCCGGAC
+AGTAATGTGTCCCTTCCCGTTCGGGGCAGTGGAGACGAGGTGG
+>212596
+TACGGGGGATGCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGCGCGTAGGCGGGACGTCAAGTCAGCGGTAAAAGACT
+GCAGCTAAACTGTAGCACGCCGTTGAAACTGGCGCCCTGGAGACGAGACGAGGGAGGCGGAACAAGTGAAGTAGCGGTGA
+AATGCATAGATATCACTTGGAACCCCGATAGCGAAGGCAGCTTCCCAGGCTCGTTCTGACGCTGATGCGCGAGAGCGTGG
+GTAGCGAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGCTCACTGGATCTTGGCGATACACGGCCAGGGTT
+CAAGCGAAAGTATTAAGTGAGCCACCTGGGGAGTACGTCGGCAACGATGAAACTCAAAGGAATTGACGGGGGCCCGCACA
+AGCGGAGGAACATGTGGTTTAATTCGATGATACGCGAGGAACCTTACCGGGTTTAAATGTAG
+>204462
+TACGTAGGTGGCAAGCGTTGTCCGGATTTACTGGGTGTAAAGGGCGTGCAGCTGGGAATGCAAGTCAGATGTGAAATCCA
+TGGGCTTAACCCATGAACTGCATTTGAAACTGTATTTCTTGAGTACTGGAGAGGCAATCGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGATTGCTGGACAGCAACTGACGGTGAGGCGCGAAAGTGTG
+GGGAGCAAACAGGATTAGATACCCTGGTAGTCCACACTGTAAACGATGAATACTAGGTGTGCGGGGACTGACCCCCTGCG
+TGCCGCAGTTAACACAATAAGTATTCCACCTGGGGAGTACGATCGCAAGGTTGAAACTCAAAGGAATTGACGGGGGCCGC
+ACAGCGGTGGATTATGTGGTTTAATTTCGAAGCAACGCGAAGAAACCTTACCAGGGTTTTGACATCCTGCTAACGAAGTA
+GAGATACATTA
+>589802
+TACGTAGGTGGCGAGCGTTGTCCGGAATTACTGGGTGTAAAGGGTGCGTAGGCGGCTTCTAAAGTCAGATGTGAAATACC
+GCAGCTCAACTGCGGGGCTGCATTTGAAACTTGGGAGCTTGAGTGAAGTAGAGGTAAGCGGAATTCCTAGTGTAGCGGTG
+GAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCTTACTGGACGATAACTGACGCTGAGGCTCGAAAGCGTG
+GGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGAATACTAGGTGTCGGGGGCACAACGTGCTTCG
+GTGCCGCAGCTAACGCAATAAGTATTCCACCTGGGGATACGTTCGCAAGAATGAAACTCAAAGGATTGACGGGGACCCGC
+ACAAGCGGTGGAGCATGTGGTTTAATTCGAAGCAACGCGAAGAACCTTACCAGTCCTTGACATCCTGCTGGCCGTTCCTA
+ACCGGACTTTTCTTCGGAACAGCAGAGACAGGTGG
+>272782
+ATACGTAGGTGGCAAGCGTTGTCCGGAATTATTGGGCGGTAAAGCGCGCGCAGGCGGCTTCTTAAGTCCATACTTAAAAG
+TGCGGGGCTTAACCCCCGTGATGGGATGGAAACTAGGGAGGTCTGGAGTATCGGAGAGGAAAGTGGAATTCCTAGTGTAG
+CGGTGAAATGCGTAGAGATTAGGAA
+>229514
+TACGTAGGGAGCAAGCGTTATCCGGATTTATTGGGTGTAAAGGGTGCGTAGACGGTAATGCAAGTTAGTTGTGAAATCCC
+TCGGCTTAACTGAGGAACTGCAACTAAAACTACATTTCTTGAGTATTGGAGGGGAAAGTGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGATTGCTGGACAGCAACTGACGGTGAGGCGCGAAAGTGTG
+GGGAGCAAACAGGATTAGATACCCTGGTAGTCCACACTGTAAATGATGAATACTAGGTGTGCGGGGACTGACCCCCTGCG
+TGCCGCAGTTAACACAATAAGTATTCCACCTGGGGAGTACGATCGCAAGGTTGAAACTCAAAGGAATTGGACGGGGGCCC
+GCACAAGCGGTGGATTA
+>250451
+TACGTAGGGGGCAAGCGTTGTCCGGAATAATTGGGCGTAAAGGGCGCGTAGGCGGCTCGGTAAGTCTGGAGTGAAAGTCC
+TGCTTTTAAGGTGGGAATTGCTTTGGATACTGTCGGGCTTGAGTGCAGGAGAGGTTAGTGGAATTCCCAGTGTAGCGGTG
+AAATGCGTAGAGATTGGGAGGAACACCAGTGGCGAAGGCGACTAACTGGACTGTAACTGACGCTGAGGCGCGAAAGTGTG
+GGGAGCAAACAGGATTAGATACCCTGGTAGTCCACACTGTAAACGATGAATGCTAGGTGTAGGGGGTATCGACCCTTCTG
+TGCCGCAGTTAACACAATAAGCATTCCGCCTGGGGAGTACGGCCGCAAGGTTGAAACTCAAAGGAATTGACGGGGGCCCG
+CACAAGCAGCGGAGCATGTGGTTTAATTCGACGCAACGCGAAGAACCTTACCAGGGTTTGACATCCTGCTAACGAAGTAG
+AGATACATTAGGTG
+>574961
+TACGTAGGTGGCAAGCGTTATCCGGAATTATTGGGCGTAAAGGGTGCGTAGGCGGCGAGATAAGTCTGAGGTAAAAGCCC
+GTGGCTCAACCACGGTAAGCCTTGGAAACTGTCTGGCTGGAGTGCAGGAGAGGACAATGGAATTCCATGTGTAGCGGTAA
+AATGCGTAGATATATGGAGGAACACCAGTGGCGAAGGCGGTTGTCTGGCCTGTAACTGACGCTGAAGCACGAAAGCGTGG
+GGAGCAAATAGGATTAGATACCCTAGTAGTCCACGCCGTAAACGATGAGAACTAAGTGTTGGGGAAACTCAGTGCTGCAG
+TTAACGCAATAAGTTCTCCGCCTGGGGAGTATGCACGCAAGTGTGAAACTCAAAGGAATTGACGGGGGCCCGCACAAGCG
+GTGGAGTATGTGGTTTAATTCGACGCAACGCGAAGAACCTTACCAGGCCTTGACATGGTATCAAAGGCCCTAGAGATAGG
+GAGATAGTAAT
+>139346
+TACGGAAGGTCCGGGCGTTATCCGGATTTATTGGGTTTAAAGGGAGCGTAGGCCGGAGATTAAGCGTGTTGTGAAATGTA
+GTGGCTCAACCTCTGCACTGCAGCGCGAACTGGTCTTCTTGAGTACGCACAACGTGGGCGGAATTCGTGGTGTAGCGGTG
+AAATGCTTAGATATCACGAAGAACTCCGATTGCGAAGGCAGCTCACGGGAGCGCAACTGACGCTGAAGCTCGAAAGTGCG
+GGTATCGAACAGGATTAGATACCCTGGTAGTCCGCACGGTAAACGGTGGAGTGCCCGGCCTGTTGGTCCGTGAATAAGGT
+CAGCGGCCAAGCGAAGACATTAAGACAT
+>588882
+TACGTAGGGGGCGAGCGTTATCCGGATTTATTGGGCGTAAAGCGTGTGTAGGCGGTTTATTAAGTTTAAGATTAAAGCCC
+GGGGCTCAACTCCGGTAAGTCTTAAAAACTGGTAGACTTGAGTACGGTAGAGGCAAACGGAATTTCTAGTGTAGCGGTGA
+AATGCGTAGATATTAGAAGGAACACCAGTGGCGAAAGCGGTTTGCTGGGCCGTTACTGACGCTGAGGCACGAAAGCGTGG
+GGAGCAAATAGGATTAGATACCCTAGTAGTCCACGCCGTAAACGATGAGTACTAAGTGTTGGGGTTACCCAGTGCTGAAG
+CTAACGTATTAAGTACTCCGCCTGAGTAGTACGGTCGCAAGGCTGAAACTCAAGGAATTAGTACGGGACGACCCGCCACA
+ACGACGGT
+>138816
+TACGTAGGGGGCAAGCGTTGTCCGGAATGATTGGGCGTAAAGGGCGCGTAGGCGGCCTGGTAAGTTTGGAGTGAAAGTCC
+TGCTTTTAAGGTGGGAATTGCTTTGAAAACTGCTGGGCTTGAGTGCAGGAGAGGTAAGTGGAATTCCCGGTGTAGCGGTG
+AAATGCGTAGAGATCGGGAGGAACACCAGTGGCGAAGGCGACTTACTGGACTGTTACTGACGCTGAGGCGCGAAAGTGTG
+GGGAGCAAACAGGATTAGA
+>28512
+TACGTAGGGGGCAAGCGTTGTCCGGAATAATTGGGCGTAAAGGGCGCGTAGGCGGCTCGGTAAGTCTGGAGTGAAAGTCC
+TGCTTTTAAGGTGGGAATTGCTTTGGATACTGTCGGGCTTGAGTGCAGGAGAGGTTAGTGGAATTCCCAGTGTAGCGGTG
+AAATGCGTAGAGATTGGGAGGAACACCAGTGGCGAAGGCGACTAACTGGACTGTAACTGACGCTGAGGCGCGAAAGTGTG
+GGGAGCAAACAGGATTAGATACCCTGGTAGTCCACACTGTAAACGATGAATGCTAGTGTAGGGGGTATCGACCCTTCTGT
+GCCGCAGTCAACACAATAAGCATTCCGCCTGGGGAGTACGGCCGCAAGGTTGAAACTCAAAGGAATTGACGGGGGCCCGC
+ACAAGCGGTGGAGTATGTGGTTTAATTCGAAGCAACGCGAAGAACCTTACCAGGGCTTGACATCCCGGTGACCCGGACTA
+GAGATAGTCCTTTCCC
+>279590
+TACGGTAGGGGGCGAGCGTTATCCGGATTTATTGGGCGTAAAGCGTGTGTAGGCGGTTTATTAAGTCCGAGATGAAAGGC
+TGAGGCTTAACCTCAGTTTTGTTTACGGAAACTGGTAGACTAGAGTGCAGTAGAGGCAATTGGAATTCATAGTGTAGCGG
+TAAAATGCGTAGATATTATGAGGAACATCAGTGGCGAAGGCGAATTGCTGGGCTGTTACTGACGCTGAGACACGAAAGCG
+GTGGGGAGACAAAGTAGGATTAGACTACCCTAGTA
+>329074
+TACGGAAGGTCCGGGCGTTATCCGGATTTATTGGGTTTAAAGGGAGCGCAGGCCGTCCTTTAAGGCGTGCTGTGAAATGC
+CGCGGCTCAACCGTGGCACTGCAGCGCGAACTGGAGGACTTGAGTACGCACGAGGTAGGCGGAATTCGTGGTGTAGCGGT
+GAAATGCTTAGATATCACGAAGAACTCCGATTGCGAAGGCAGCTTACCGGAGCGCAACTGACGCTGAGGCTCGAAAGCGC
+GGGTATCGAACAGGATTAGATACCCTGGTAGTCCGCGCGGTAAACGATGGATGCCCGCCGCTGGGATTTGGATTTCAGCG
+GCCAAGCGAAAGCGTTAAGCATCCCACCTGGGGAGTACGCCGGCAACGGTGAAACTC
+>581733
+TACGTAGGGGGCAAGCGTTGTCCGGAATAATTGGGCGTAAAGGGCGCGTAGGCGGCTCGGTAAGTCTGGAGTGAAAGTCC
+TGCTTTTAAAGGTGGGAATTGCTTTGGATGCTGTCGGGCTTGAGTGCAGGAGAGGTTAGTGGAATTCCCAGTGTAGCGGT
+GAAATGCGTAGAGATTGGGAGGAACACCAGTGGCGAAGGCGACTAACTGGACTGTAACTGACGCTGAGGCGCGAAAGCGT
+GGGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGAATACTAGGTGTTGGTTCCAAAGGAACTCGG
+TGCCGTCGCAAACGCATTAAGTATTCCACTGGGATACGTTCGCAAGAATGAAACTCAAAGGATTAGACGGGACCCGCACA
+GCGGTGGAGCATGT
+>1718
+TACGTAGGTGGCGAGCGTTATCCGGAATCATTGGGCGTTAAAGAGGGAGCAGGCGGCCGCAAGGGTCTGTGGTGAAAGAC
+CGAAGCTAAACTTCGGTTAAGCCTTGGAAACCGGGCGGCTAGGAGTGCGGAGAGGATCGTGGAATTCCATGTGTAGCGGT
+GAAATGCGTAGATATATGGAGGAACACCAGTGGCGAAGGCGACGGTCTGGGCCGCAACTGACGCTCATTCCCGG
+>582971
+TACGGAGGATCCTAGCGTTATCCGGATTTATTGGGTTTAAAGGGAGCGTAGATGGGTTGTTAAGTCAGTTGTGAAAGTTT
+GCGGCTCAACCGTAAAATTGCAATTGATACTGGCAGTCTTGAGTACAGTTGAGGTAGGCGGAATTCGTGGTGTAGCGGTG
+AAATGCTTAGATATCACGAAGAACTCCGATTGCGAAGGCAGCTTACTAAACTGCAACTGACATTGATGCTCGAAAGTGTG
+GGTATCAAACAGGATTAGATACCCTGGTAGTCCACACGGTAAACGATGAATACTCGCTGTTGGCGAGTACACAGCCAGCG
+GCCAAGCG
+>508980
+TACGTAGGTGGCAAGCGTTGTCCGGATTTACTGGGTGTAAAGGGCGTGTAGCCGGGAAGGCAAGTCAGATGTGAAATCCA
+CGGGCTTAACTCGTGAACTGCATTTGAAACTACTTTTCTTGAGTATCGGAGAGGCAATCGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGATGTCTGGACGGACAACTAGACGGTGGAGG
+>63221
+TACGTAGGTGGCAAGCGTTGTCCGGATTTACTGGGTGTAAAGGGCGTGTAGCCGGGTCGGCAAGTCAGATGTGAAATCTG
+GAGGCTCAACCTCCAAACTGCATTTGAAACTGCCGGTCTTGAGTATCGGAGAGGTAATCGGAATTCCTTGTGTAGCGGTG
+AAATGCGTAGATATAAGGAAGAACACCAGTGGCGAAGGCGGTTTACTGGGCCATAACTGACGCTGAGGCACGAAAGCGTG
+GGGAGCAAATAGGATTAGATACCCTAGTAGTCCACGCCGTAAACGATGAGTACTAAGTGTTGGGGCAACCCAGTGCTGAA
+GTTAACACATTAAGTACTCCGCCTGAGTAGTACGGTCGCAAGGCTGAAACTCAAAGGAATTGACGGGCACCCGCGACAAG
+CAGTGGAGCATGCTGTTAATTCGAAGATACGCGAAGAACCTTACCAGGGCTTGACATCCC
+>577111
+TACGGAAGGTCCGGGCGTTATCCGGATTTATTGGGTTTAAAGGGAGCGTAGGCCACAGGTTAAGCGTGTTGTGAAATGTA
+GGGGCTCAACCTCTGCACTGCAGCGCGAACTGGCTTGCTTGAGTACGCACAACGTGGGCGGAATTCGTGGTGTAGCGGTG
+AAATGCTTAGATATCACGAAGAACTCCGATTGCGAAGGCAGCTCGACGGGAGCGCAACTGACGCTGAAG
+>136740
+TACGTAGGTGGCAAGCGTTGTCCGGAATTACTGGGTGTAAAGGGAGCGTAGGCGGGATGCCAAGTCAGCTGTGAAAACTA
+TGGGCTTAACTTGTAGACTGCAGTTGAAACTGGTATTCTTGAGTGAAGTAGAGGTTGGCGGAATTCGTGGTGTAGCGGTG
+AAATGCTTAGATATCACGAAGAACTCCGATTGCGAAGGCAGCTCACTGGAGTGTAACTGACGCTGATGCTCGAAAGTGTG
+GGTATCAAACAGGATTAGATACCCTGGTAGTCCACACAGTAAACGATGAATACTCGCTGTTTGCGATATACAGTAAG
+>144091
+AACGTAGGGTGCAAGCGTTGTCCGGAATTACTGGGTGTAAAGGGAGCGCAGGCGGGAAGACAAGTTGGAAGTGAAAACCA
+TGGGGCTCAACCCATGAATTGCTTTCAAAACTGTTTTTCTTGAGTAGTGCAGAGGTAGATGGAATTCCCGGTGTAGCGGT
+GGAATGCGTAGATATCGGGAGGAACACCAGTGGCGAAGGCGGTCTACTGGGCACCAACTGACGCTGA
+>239291
+TACGTAGGGAGCAAGCGTTATCCGGATTTATTGGGTGTAAAGGGTGCGTAGACGGGAGTGCAAGTTGGTTGTGAAATCCC
+TCGGCTTAACTGAGGAACTGCAACCAAAACTACATTTCTTGAGTGCTGGAGAGGAAAGTGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGACTTTCTGGACAGTAACTGACGTTGAGGCACGAAAGTGTG
+GGGAGCAAACAGGATTAGATACCCTGGTAGTCCACACCGTAAACGATGGATACTAGGTGTAGGGGATGAATAATCTTCTG
+TGCCGTCGCAAACGCAGTAAGTATCCCACCTGGGGAGTACGGCCGCAAGGTTGAAACTCAAAGGATTGACGGGGGCCCGC
+ACAAGCAGTGGAGTATGTGGTTTAATTCGACGCAACGCGAAGACCTTACAGGGCTTGACATATACATGAAAATCTAAGAG
+ATTAGAATCCCTCTTTCGGAGCATGTATACGAGGTGGTG
+>244496
+TACGTATGGTGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGCAGGCGGTGCGGCAAGTCTGATGTGAAAGCCC
+GGGGCTCAACCCCGGTACTGCATTGGAAACTGTCGTACTAGAGTGTCGGAGGGGTAAGTGGAATTTCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGAAGGAACACCAGTGGCGAAAGCGGTTTGCTGGGCCGTTACTGACGCTGAGGCACGAAAGCGTG
+GGGAGCAAATAGGATTAGATACCCTAGTAGTCCACGCCGTAAACGATGAGTACTAAGTGTTGGAATTTCCAGTGCTGAAG
+CTAACGTATTAAGTACTCCGCCTGAGTAGTACGGTCGCAAGGCTGAAACTCAAAGGAATTGACGGGCACCCGCACAAGCG
+GTGGAGCATGCTGTTTAATTCGAAGATACGCGAAGAACCTTACCTAGACTTGACATCCCTGGCAAAGCTATAGAAATATA
+GTGGAGGCTATCCAGGTGAC
+>206632
+TACGTAGGTGGCGAGCGTTATCCGGAATCATTGGGCGTAAAGAGGGAGCAGGCGGCCGCAAGGGTCTGTGGTGAAAGACC
+GAAGCTAAACTTCGGTAAGCCATGGAAACCGGGCGGCTAGAGTGCGGAAGAGGATCGTGGAATTCCATGTGTAGCGGTGA
+AATGCGTAGATATATGGAGGAACACCAGTGGCGAAGGCGACGGTCTGGGCCGCAACTGACGCTCATTCCCGAAAGCGTGG
+GGAGCAAATAGGATTAGATACCCTAGTAGTCCACGCCGTAAACGATGGTCACTAAGTGTCGGGGGTCAAACCCCGGTGCT
+GCAGTCAACGCAATAAGTGACCCGCCTGAGTAGTACGTTCGCAAGAATGAAACTCAAAGGATTGACGGGGCCCGCACAAG
+CGTGGAGCATGTGGTTTAATCGAAGCAACGCGAAGAACCTTACGAGGGCTTGACATCCTAACTACGAGATAGAGATATGT
+AGTG
+>222914
+TACGTATGGAGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGTGTAGGTGGCATCACAAGTCAGAAGTGAAAGCCC
+GGGGCTCAACCCCGGGACTGCTTTTGAAACTGTGGAGCTGGAGTGCAGGAGAGGCAAGTGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCTTGCTGGACTGTAACTGACACTGAGGCTCGAAAGCGTG
+GGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGAATACTAGGTGTCGGGGCCCATAAGGGCTTCG
+GTGCCGCAGCAAACGCAATAAGTATTCCACCTGGGGAGTACGTTCGCAAGAATGAAACTCAAAGGATTGACGGGGACCCG
+CACAAGCGGTGGAGCATGTGGTTTAATTCGAAGCAGCGCGAAGAACCTTACCAAGTCTTGACATCCTCTTGCCCGGTCAG
+TAATGTGACCTTTTCTACGGAACAAGAGTGACA
+>242989
+TACGGAAGGTCCGGGCGTTATCCGGATTTATTGGGTTTAAAGGGAGCGTAGGCCGGAGATTAAGCGTGTTGTGAAATGTA
+AGTGGCTCAACCTCTGCACTGCAAGCGTCGTAACTGGTCTTCTTGAGTACGCACAACGTGGGCGGAATTCGTGGTGTAGC
+GGTGAAATGCTTAGATATCACGAGAACTCCGATTGCGAAGGCAGCTCACGGGAGCGCAACTGACGCTGAAGCTCGAAAGT
+GCGGGTATC
+>238929
+AACGTAGGGTGCAAGCGTTGTCCGGAATTACTGGGTGTAAAGGGAGCGCAGGCGGGAAGACAAGTTGGAAGTGAAAACCA
+TGGGCTCAACCCATGAAATTGCTTTCAAAAACTGTTTTCTTGAGTAGTGCAGAGGTAGATGGAATTCCCGGTGTAGCGGT
+GGAATGCGTAGATATCGGGAGAACACCAAGTGGCGAAGGCGGTCTACTGGGCACCAGCTGACGCTGAGGCTCGAAAGCAT
+GGGTAAGCAAACAGGAATTAGATACCCTGGTAGTCCATGCCGTAAACGA
+>155970
+TACTGCCGCATGGTGGTGAGCATGTTGTCGATGTTGAAGTCGGACTTCTCCACCTTGATGTCACGCTGTGCCAGTTCCTT
+GCGGTAGTCGTCACGCATATACTCGTAGAAGGTGTTGAACGACGGCACGATGCTACGGTCGGATTGGATGCGCTCAATAT
+A
+>137099
+TACGTAGGTGGCAAGCGTTGTCCGGAATTACTGGGTGTAAAGGGTGCGCAGGCGGGATTGCAAGTTGGATGTGAAATACC
+GGGGCTTAACCCCGGAGCTGCATCCAAAACTGTAGTTCTTGAGTGGAGTAGTGGTAAGCGGAATTCCGAGTGTAGCGGTG
+AAATGCGTAGATATTCGGAGGAACACCAGTGGCGAAGGCGGCTTACTGGGCTCTAACTGACGCTGAGGCACGAAAGCATG
+GGTAGCAAACAGGATTAGATACCCTGGTAGTCCATGCTGTAAACGATGAATGCTAGGTGTGGGGGGACTGGACCCTTCCG
+TGCCGGAGTAAACACAATAAGCATTCCACCTGGGGAGTACGGACCGCGAACGGTTGAAACTCAAAAGGAATTAGGACGGG
+GGCCCG
+>533173
+TACGTAGGTGGCAAGCGTTGTCCGGATTTACTGGGTGTAAAGGGCGTGCAGCCGGGCCGGTAAGTCAGATGTGAAATCTA
+TGGGCTCAACCCATAAATTGCATTTGAAACTGCTGGTCTTGAGTACTGGAGAGGCAGACGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGTCTGCTGGACAGCAACTGACGGTGAGGCGCGAAAGCGTG
+GGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCTGTAAACGATGGATACTAGGTGTGCGGGGACTGACCCCCTGCG
+TGCCGCAGTTAACACAATAAGTATCCCACCTGGGGAGTACGATCGCAAGGTTGAAACTCAAAGGAATTGACGGGGGCCCG
+CACAAGCGGTGGATTATGTGGTTTAATTCGAAGCAACGCGAAGAACCTTACCAGGGCTTGACATCCTACTAACGAAGTAG
+AGATACATTA
+>544430
+TACGGAGGATCCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGTAGGCGGCCTTTTAAGTCAGCGGTGAAAGTCT
+GTGGCTCAACCATAGAATTGCCGTTGAAACTGGGGGGCTTGAGTATGTTTGAGGCAGGCGGAATGCGTGGTGTAGCGGTG
+AAATGCTTAGATATCACGCAGAACCCCGATTGCGAAGGCAGCCTGCCAAGCCATGACTGACGCTGATGCACGAAAGCGTG
+GGGATCAAACAGGATTAGATACCCTGGTACGTCCACGCAGTAAACGATGATCACTAGTCTTGTTTGCGATACAGTGTAAG
+CGGCACAGACGAAAGTCGTTAAGTGACTCCGACGTGGGGACGTACGGCCGGAC
+>309328
+TACGTAGGGGGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGTAGGCGGCGGAGCAAGTCAGAAGTGAAAGCCC
+GGGGCTCAACCCCGGGACGGCTTTTGAAACTGCCCTGCTTGATTTCAGGAGAGGTAAGCGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGACTTACTGGCCTGCAACTGACACTGAGGCACGAAAGCGTG
+GGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGATTACTAGGTATGGGAGGTATCGACTCCTTCC
+GTGCCGCAGCAAACGCAATAAGTAATCCGCCTGGGGAGTACGGCCGCAAGGTTGAAACTCAAAGAAATTGACGGGGACCC
+GCACAAGCAGCGGAGCATGTGGTTTAATTCGACGCAACGCGAAAACCTTACCAGGTCTTGACATCCTTTGACCGCCTGAG
+AGATCAGGAATCTCTAGCAATAGAGCAGAGAGACA
+>588740
+TACGGAGGATCCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGTAGGCGGCCTTTTAAGTCAGCGGTGAAAGTCT
+GTGGCTCAACCATAGAATTGCCGTTGAAACTGGGGGGCTTGAGTATGTTTGAGGCAGGCGGAATGCGTGGTGTAGCGGTG
+AAATGCTTAGATATCACGCAGAACCCCGATTGCGAAGGCAGCCTGCCAAGCCATGACTGACGCTGATGCACGAAAGCGTG
+GGGATCAAACAGGATTAGATACCCTGGTAGTCCACGCAGTAAACGATGAACTCACTAGCTGTTTGCGATACAGTGTAACG
+CGGCACAGCGAAAGCGTTAAAGTGATCCACCTGGGGGAGTTACGCCGGCCAACGGTGAAACTCAAAGGAATTGAC
+>200064
+TACGTAGGGGGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGTAGGCGGCGATGCAAGTCAGAAGTGAAAGCCC
+AGGGCTTAACCGTGGGACTGCTTTTGAAACTGTGTTGCTGGATTGCCGGAGAGGTAAGTGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCTTACTGGACGGTGAATGACGCTGAGGCTCGAAAGCGTG
+GGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGAATACTAGGTGTCGGGGCCCAAAAGGGTTCCG
+GTGCCGCAGCAAACGCGAATAAGTATTCCACCTGGGGAGTACGTT
+>582509
+TACGTAGGGAGCAAGCGTTATCCGGATTTATTGGGTGTAAAGGGTACGTAGGCGGCCTCATAAGTCTGTGGTTTAAGCCC
+GAAGCTTAACTTCGGTTCGCCACAGAAACTGTTTGGCTTGAGTATGGTAGAGGCAAGTGGAATTTCTAGTGTAGCGGTTA
+AATGCGTAGATATTAGTAAGGAACGACCACGTGGCGGAA
+>524765
+TACGTAGGTGCAAGCGTTATCCGGAATTAGTTGGGCGTAAAGGGCTCGTAGGCGGTTCGTCGCGTCCGGTGTGAAAGTCC
+ATCGCTTAACGGTGGATCCGCGCCGGGTACGGGCGGGCTTGAGTGCGGTAGGGGAGACTGGAATTCCCGGTGTAACGGTG
+GAATGTGTAGATATCGGGAAGAACACCAATGGCGAAGGCAGGTCTCTGGGCCGTTACTGACGCTGAGGAGCGAAAGCGTG
+GGGAGCGAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGGTGGATGCTGGATGTGGGGCCCGTTCCACGGGTTC
+CGTGTCGGAGCTAACGCGTTAAGCATCCCGCCTGGGGAGTACGGCCGCAAGGCTAAAACTCAAAGAAATTGACGGGGCCC
+GCACAAGCGGCGGAGCATGCGGATTAATTCGATGCAACGCGAAGACCTACTGGGCTTGACATGTTCCCGAGGTCGTAGAG
+ATACGGCTTCCCTTTCGGGGCGGGTTCACGAGGTGGTGCATGGTCGTCGTCAG
+>259028
+TACGTAGGTGGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGCGCGTAGGCGGGGATGCAAGTCAGATGTGAAATCTA
+TGGGCTTAACCCATAAACTGCATTTGAAACTGTATCTCTTGAGTGCTGGAGGGGTAGACGGAATTCCTTGTGTAGCGGTG
+AAATGCGTAGATATAAGGAAGAACACCAGTGGCGAAGGCGGTCTACTGGACAGTAACTGACGCTGAGGCGCGAGAGCGTG
+GGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCTGTAAACGATGAATACGTAGGTGTGGGAGGACTGACCCCTTCC
+GTGCCGCAGTTAACACAATAAGTATT
+>533036
+TACGTAGGGAGCAAGCGTTGTCCGGAATTACTGGGTGTAAAGGGAGCGTAGGCGGGGAAACAAGTTGAATGTGAAATCCA
+TGGGCTCAACCCATGGCTGCGTTCAAAACTGTATCTCTTGAGTGAAGTAGAGGCAGGCGGAATTCCTAGTGTAGCGGTGG
+AATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCCTGCTGGGCTTTTACTGACGCTGAGGCTCGAAAGCGTGG
+GTAGCAACAGGATTAGATACCCTGGTAGTCCACGCGGTAAACGATGATTACTAGGTGTGGGGTGGACTGACCCCATCCGT
+GCCGGAGTTAACACAATAAGTAATCCACCTGGGGAGTACGGCCGCAAGGTTGAAACTCAAAGGAATTGACGGGGGCCCGC
+ACAAGCAGTGGAGTATGTGGTTTAATTCGACGCAACGCGAAGAACCTTACCAGGTCTTGACATCGAGTGACGAGCGTAGA
+GATACGCTTTCCCTTCGGGCACGAAGCAGGTGGTGCTAG
+>200049
+TACGTAGGGGGCAAGCGTTGTCCGGAATAATTGGGCGTAAAGGGCGCGTAGGCGGCTCGGTAAGTCTGGAGTGAAAGTCC
+TGCTTTTAAGGTGGGAATTGCTTTGGATACTGTCGGGCTTGAGTGCAGGAGAGGTTAGTGGAATTCCCAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCTTGCTGGACGACAACTGACGCTGAGGCGCGAAAGCGTG
+GGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCTGTAAACGATGAATACTAGGTGTGGGAGGACTGACCCCTTCCC
+GTGCCCGCAGTTAACACAATAAGTATTCCACCTGGGGAGTACGACCGCAAGGTTGAAACTCAAAGGAATTGAACGGGGGC
+CCGCGACGAAG
+>179213
+TACGTAGGGGGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGCAGGCGGCATGATAAGTCTGATGTGAAAACCC
+AAGGCTCAACCATGGGACTGCATTGGAAACTGTCGTGCTGGAGTGTCGGAGAGGTAAGCGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCTTACTGGACGATGACTGACGCTGAGGCTCGAAAGCGTG
+GGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGAATACTAGGGTGGTCGGGGAGCACAGCTCGTT
+CGGTGCCGCAGCAAACGCAGTAAGTA
+>2783
+AGGTAGCGTACCACTCCTGATTGGACTGGAAGGGTACCTCCGGGTGCCCTCCTGATAGAACAGGTCGGCTTCTTCCACGT
+TCAGCTCCGGGTAGGGGTGGAGTGTAGCTGGCGGTGAAGGGCAACTGGCTGCCCTCGGTGTCTGTGTCTGTGGGCAAAAG
+ACACATCCGGTCGGCGTCTGCGGGTCCGTGTCAAATAGCCCTCGTCGGTGTCGCGGCCGGTCTCCAGCCAGTTGCCCTCC
+AGCGACCACGGGCTGGTGTCGGTCTCCCAGTCGTCCCCATCCGGGTAGATGGTAAAAT
+>352654
+TACGTAGGTTGCAAGCGTTGTCCGGATTTACTGGGTGTAAAGGGCGTGTAGGCGGAGATGCAAGTTAGGAGTGAAATCTA
+TGGGCTTAACCCATAAACTGCTTCTAAAACTGTACTCCTGAGTATCGGAGAGGCAAGGACGGTAACTTCCTAGTGTAGCG
+GTAGAAATGCGTAGATTAGTTA
+>269075
+TACGTAGGGAGCGAGCGTTGTCCGGAATTACTGGGTGTAAAGGGAGCGTAGGCGGGATCTTAAGTCAGGTGTGAAAACTA
+TGGGCTCAACCCATAGACTGCACTTGAAACTGAGGTTCTTGAGTGAAGTAGAGGCAGGCGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAGGAACATCAGTGGCGAAGGCGGCCTGCTGGGCTTTTACTGACGCTGAGGCTCGAAAGTGTG
+GTATCAAACAGGATTAGATACCCTGGTAGTCCACACGGTAAACGATGAATACTCGCTGTTTGCGATATACTGCAAGCGGC
+CAAGCGAAAGACGTTAAGTATTCCACCTGGGGAGTACGCCGGCAACGGTGAAACT
+>542353
+TACGGAGGATCCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGAGCGTAGGCGGACGCTTAAGTCAGTTGTGAAAGTTT
+AGTCGGCTCAACCGTAAAAACTTAGCAGTTGATACTGGGTGGTCTTGTAGTACAGTAGAGGCGAGGCGGAATTCGTGGTG
+TAGCGGTGAAATGCTTAGATATCACGAAGAACTCCGATTGCG
+>244840
+CCTACGTAGGTGGCGAGCGTTGTCCGGAATTACTGGGTAGTAAAGGGAGTGTAGGCGGGAAGGCAAGTCAGAAGTAGAAA
+ATTATGGGCTTAACCCATAACCTGTCTTTTGTAAACTTGTTTTTCTTGAGTGAGGCAGAGCAAGCGGAATTCCTAGTGTA
+GCGGTGAGATGCGTAGATA
+>513639
+TACGTAGGGGGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGTAGACGGCGATGCAAGTCTGAAGTGAAATACC
+CGGGCTCAACCTGGGAACTGCTTTGGAAACTGTGTTGCTAGAGTGCTGGAGAGGTAAGCGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAAGAACACCAGTGGCGAAGGCGACTTTCTGGACGACAACTGACGCTGAGGCGCGAAAGCGTG
+GGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGAATACTAGGTGTAGGAGGTATCGACCCCTTCTG
+TGCCGGAGCTAACACAATAAGTATTCCGCCTGGGAAGTACGATCGCAAGATTAAAACTCAAGGGAA
+>278955
+CACGGGGGATGCGAGCGTTATCCGGATTCATTGGGTTTAAAGGGAGCGTAGGCGGCCCGACAAGTCAGCGGTAAAAGACT
+GCAGCTAAACTGTAGCGCGCCGTTGAAACTGCCGGGCTAGAGTGCAGACGAGGTTGGCGGAACAGGTGAAGTAGCGGTGA
+AATGTATAGATATCACCTGGAACCCCGACAGCGAAGGCAGCTGACCAGGCTGTAACTGACGCTGATGCTCGAGAGCGTGG
+GTAGCGAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGCTCACTGGCCCTGGGCGACATACAGTCCGGGGT
+CAGCGAAAGTAAAGTGAGCCACCTGGGGATACGTCGGCAACGATGAAACTCAAAGGATTGGACGGGGGCCCGCACAAGCG
+GAGGAACATGTGGTTAATTCGATGATACGCGAGGAACCTTACCGGGTTTAAATGCAGGTT
+>321885
+TACGTAGGGAGCAAGCGTTGTCCGGAATTACTGGGTGTAAAGGGAGTGTAGGCGGGACTGCAAGTCAGATGTGAAATTTA
+GGGGCTCAACCCCTGAACTGCATTTGAAACTGTGGTTCTTGAGTGAAGTAGAGGTAAACGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAGGAACATCAGTGGCGAAGGCGGTTTACTGGGCTTTTACTGACGCTGAGGCCTACGAAGCGG
+TGGGGAGACAAACAGGTATTAGACTACCCTGGTACGTCCACGCTAGTAAACGATG
+>567145
+TACGGAGGATCCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGAGCGTAGATGGATGTTTAAGTCAGTTGTGAAAGTTT
+GCGGCTCAACCGTAAAATTGCAGTTGATACTGGATATCTTGAGTGCAGTTGAGGCAGGCGGAATTCGTGGTGTAGCGGTG
+AAATGCTTAGATATCACGAAGAACTCCGATTGCGAAGGCAGCCTGCTAAGCTGCAACTGACATTGAGGCTCGAAAGTGTG
+GGTATCAAACAGGATTAGATACCCTGGTAGTCCACACGGTAAACGATGAATACTCGCTGTTTGCGATAGTACAGCAAGCG
+GCCAAGCGAAAGCGTTAAGATTCCACCTGGGGATACGCGGCAACGGTGAACTCAAAGGAATTGACGGGGGCCCGCGACCA
+AG
+>572242
+AACGTAGGGTGCAAGCGTTGTCCGGAATTACTGGGTGTAAAGGGAGCGCAGGCGGACCGGCAAGTTGGAAGTGAAAACTA
+TGGGCTCAACCCATAAATTGCTTTCAAAACTGCTGGCCTTGAGTAGTGCAGAGGTAGGTGGAATTCCCGGTGTAGCGGTG
+GAATGCGTAGATATCGGGAGGAACACCAGTGGCGAAGGCGACCTACTGGGCACCAACTGACGCTGAGGCTCGAAAGCATG
+GTAGCAACAGATTAGATACCTGGTAGTCCATGCCGTAAACGATAGATTACTAGGTGTTGGAGGATTGACCCTTCCAGTGC
+CGCAGTTAACACAATAAGTAATCCACCTGGGGAGTACGA
+>329570
+TACGGAGGATCCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGAGCGTAGATGGATATTTAAGTCAGTTGTGAAAGTTT
+GCGGCTCAACCGTAAAATTGCAGTTGATACTGGATATCTTGAGTGCAGTTGAGGCAGGCGGAATTCGTGGTGTAGCGGTG
+AAATGCTTAGATATCACGAAGAACTCCGATTGCGAAGGCAGCCTGCTAAGCCGCAACTGACATTGAGGCTCGAAAGTGTG
+GGTATCAAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGAATACTAGGTGTGGGGGGACTGACCCTTCCGT
+GCCGCAGTTAACACAATAAGTATTCCACCTGGGAGTACGACCGCAAGGTTG
+>239490
+TACGTATGGAGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGTAGGCGGTCCTGCAAGTCTGATGTGAAAGGCC
+GGGGCTCAACCCCGGGACTGCATTGGAAACTGTAGGACTAGAGTGTCGGAGGGGTAAGTGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCTTACTGGACGACTACTGACGCTGAGGCTCGAAAGCGTG
+GGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGAATACTAGGTGTTGGCAGGTAAGACCTGTCGG
+TGCCGCAGCAAACGCAATAAGTATTCCACCTGGGGATACGTTCGCAAGAATGAAACTCAAAGGAATTGACGGGGACCCGC
+ACAAGCGGTGGAGCATGTGGTTTAATTCGAAGCAACGCGAAGAACCTTACCAAGTCTTGACATCCCGGTGACAGAACATG
+TAATGT
+>213873
+TACGTATGGTGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGCAGGCGGTGCGGCAAGTCTGAAGTGAAAGCCC
+GGGGCTTAACCCCGGGACTGCTTTGGAAACTGTAGGACTAGAGTGCAGGAGAGGTAAGTGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCTTACTGGACTGTAACTGACGTTGAGGCTCGAAAGCGTG
+GGGAGCAAACATGATTAGATACCCTGGTAGTCCACGACCGTAAACGATGATTACGTAGGTGTTGGTGGGTAT
+>554003
+TACGTAGGTGGCAAGCGTTGTCCGGATTTACTGGGTGTAAAGGGCGTGCAGCCGGGAATGCAAGTCAGATGTGAAATCCA
+TGGGCTTAACCCATGAACTGCATTTGAAACTGTATTTCTTGAGTACTGGAGAGGCAATCGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGATTGCTGGACAGCAACTGACGGTGAGGCGACGAAAGTGG
+TGGGGAGACAAAC
+>182894
+TACGGAAGGTCCGGGCGTTATCCGGATTTATTGGGTTTAAAGGGAGCGCAGGCCGTCCTTTAAGCGTGCTGTGAAATGCC
+GCGGCTCAACCGTGGCACTGCAGCGCGAACTGGAGGACTTGAGTACGCACGAGGTAGGCGGAATTCGTGGTGTAGCGGTG
+AAATGCTTAGATATCACGAAGAACTCCGATTGCGAAGGCAGCTTACCGGAGCGTAACTGACGCTGAGGCTCGAAAGCGCG
+GTATCGAACAGGATTAGATACCCTGGTAGTCCGCGCGGTAAACGATGGATGCCCGCCGTTGGGATTTGGATTTCAGCGGC
+CAAGCGAAAGCGTTAAGCATCCCACTGGGGAGTACGCCGGCAACGGTGAAACTC
+>88335
+ACCTGCCAGTTTCACCTTATCACGGTCTACATCGAAATAGAGCTGCGGGATATCTGCCTGCAAAGATGATGATAACCCGG
+ATAATTCCTTGCGTTGCGAAGCGTAATACATCAACGTATCCGTCGCTTGTACCAGATTATCAAAAGTAGCGTCTCCACGT
+GCTTCGAGTTGCATCTCGAAACCACCCGAACTTCCCAGCCCCGGGATAACCGGCGGTGTGGAAAGATATACCTTACACTC
+GGGATACTCCTGCAAATCCTTTTCACCTGAGCCATTACCTCATCGATAGTCTGTCCGTCACGTGCACCCCACGGTTTCAG
+AATAACTGTCAGTTCACTACGTGCCTGACTGCTACCCACACGGGGGCTACTACCTACTACACTCTGAACATACTCTACAG
+CCGGATTCAACATCAGATATTCTATAGCACGTTCCGATACGATTCGCGTACGCTCCAGCGTCGCACCTTCAGGCAAC
+>89763
+TACGGGAAGGTTCAGGCGTTATCCGGATTTATTGGGTTTAAAGGGAGCGCAGGCGGACCTTTAAGTCAGCTGTGGAAATA
+CGGCGGCTCAACCGTCGAACTGCAGTTGATACTGGAGGTCTTGAGTGCACACAGGGATACTGGAATTCATGGTGTAGCGG
+TGAAATGCTCAGATATCATGAAGAACTCCGATCGCGAAGGCAGGTATCCGGGGT
+>248299
+TACGTAGGGAGCGAGCGTTGTCCGGAATTACTGGGTGTAAAGGGAGCGTAGGCGGGATCTTAAGTCAGGTGTGAAACTAT
+GGGCTCAACCCATAGACTGCACTTAGAAACGTGAGGTTCTTGAGTGAAGTAGAGGCAGGCGGAATTCCTAGTGTAGCGGT
+GAAATGCGTAGATATTAGGAGGAACATCCAGTGGCGAAGGCGGCCTGCTGGGCTTTTACTGACGCTGAGGCTCGAAAGCG
+TGGGGAGC
+>136780
+TACGTAGGGGGCAAGCGTTGTCCGGAATAATTGGGCGTAAAGGGCGCGTAGGCGGCTCGGTAAGTCTGGAGTGAAAGTCC
+TGCTTTTAAGGTGGGAATTGCTTTGGATACTGTCGGGCTTGAGTGCAGGAGAGGTTAGTGGAATTCCCAGTGTAGCGGTG
+AAATGCGTAGAGATTGGGAGGAACACCAGTGGCGAAGGCGACTAACTGGACTGTAACTGACGCTGAGGCGCGAAAGTGTG
+GGGAGCAAACAGGATTAGATACCCTGGTAGTCCACACTGTAAACGATGAATGCTAGGTGTAGGGGGTATCGACCCTTCTG
+TGCCGCAGTCAACACAATAAGCATTCCGCCTGGGGAGTACGGCCGCGAAGGTTGAAACGTACAAGGAAAGTTAGTACGGG
+GGACCCCGGGCCGACCGAAACGACGACGACGGACGGGACGA
+>368027
+TACGGAGGATCCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGAGCGTAGATGGATGTTTAAGTCAGTTGTGAAAGTTT
+GCGGCTCAACCGTAAAATTGCAGTTGATACTGGATATCTTGAGTGCAGTTGAGGCAGGCGGAATTCGTGGTGTAGCGGTG
+AAATGCTTAGATATCACGAAGAACTCCGATTGCGAAGGCAGCCTGCTAAGACTGCAACTGACATT
+>106480
+TACGTATGGTGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGTAGGTGGCAAGGCAAGCCAGGAAGTGAAAACC
+CGGGCTCAACCGCGGGATTGCTTTTGGAACTGTCATGCTAGAGTGCAGGAGGGGTGAGCGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAGGAACACCGGAGGCGAAGGCGGCTCACTGGACTGTAACTGACACTGAGGCTCGAAAGCGTG
+GGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCGGTAAACGATGAATACTAGATGTCGGGTAGCAAAGCTATTCGG
+TGTCGTCGCAAACGCAATAAGTATTCCACGTGGGATCGTCGCAAGATG
+>296956
+TACGTAGGGTGCGAGCGTTAATCGGAATTACTGGGCGTAAAGGGTGTGCAGGCGGTTTTGCAAGATGGATGTGAAAGCCC
+CGGGCTTAACCTGGGAAAGCCATACATGACTGCAAGACTAGAGTGCGTCAGGAGGGGGTGGAATTCCAAGTGTAGCAGTG
+AAATGCGTAGATA
+>319724
+TACGTATGGTGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGTAGACGGAGTGGCAAGTCTGATGTGAAAACCC
+GGGGCTCAACCCCGGGACTGCATTGGAAACTGTCAATCTAGAGTACCGGAGAGGTAAGCGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAGGAGCACCAGTGGCGAAGGCGGCTTACTGGACGGTAACTGACGTTGAGGCTCGAAAGCGTG
+GGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGACTACTAGGTGTCGGGCAGCAAAGCTGTTCGG
+TGCCGCAGCAAACGCAATAAGTAGTCCACCTGGGGAGTACGTTCGCAAGAATGAAACTCAAAGGAATTGACGGGGGCCCG
+CACAAGCGGAGGAACATGTGGTTTAATTCGATGATACGCGAGGAACCTTACCGGGCTTAAATTGCAGATGAATTACGGTG
+AAAGCCGTAAGCCGCAAGGCATCTGTGAAGGTGCTGCA
+>254133
+TACGTAGGGGGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGTAGGCGGTTTGGCAAGTCAGAAGTGAAAGCCC
+AAGGCTTAACCATGGGACTGCTTTTGAAACTGTCAGACTAGATTGCAGGAGAGGTAAGTGGAATTCCTGGTGTAGCGGTG
+AAATGCGTAGATATCAGGAGGAACACCGGTGGCGAAGGCGGCTTACTGGACTGTAAATGACGCTGAGGCTCGAAAGCGTG
+GGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCGGTAAACGATGAATACTAGATGTTGGGGAGCGAAGCTCTTCGG
+TGTCGCAGCAAACGCAATAGTATTCCACCTGGGGAGTACGTTCGCAAGAATGAAACTCAAAGG
+>70379
+TACGTAGGGGGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGTAGACGGCATCACAAGTCAGAAGTGAAAATCC
+GGGGCTCAACCCCGGAACTGCTTTTGAAACTGTGGAGCTGGAGTGCAGGAGAGGTAAGCGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCTTACTGGACTGTAACTGACGTTGAGGCTCGAAAGCGTG
+GGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGAATACTAGGTGTTGGGTTTCATAAGAAGCTCG
+GTGCCGGCGCAAACGCATTAAGTATTCCACCTGGGGATACGTTCGCAAGATGAAACTCAAAAGGAATTGACGGGGGACCC
+GCACGAAGCGGTGGAGCAGTGTGGTTTAATCGAAGCAACG
+>65695
+TACGGAAGGTCCGGGCGTTATCCGGATTTATTGGTTTAAAGGGAGGCGTAAGGCCGTTTGGTAAGCGTGTTGTGAAATGT
+CGGGGCTCAAACCTGGGCATTGCCAGCGCGAACTGCCAGACTTGAGTGCGCAGGAAGTAGGCGGAATTCGTCGTGTAGCG
+GTGAAATGCTTAGATATGACGAAGAACTCCGATTGCGAAGGCAGCCT
+>534516
+TACGTAGGTGGCGAGCGTTATCCGGAATCATTGGGCGTAAAGAGGGAGCAGGCGGCCGCAAGGGTCTGTGGTGAAAGACC
+GAAGCTAAACTTCGGTAAGCCATGGAAACCGGGCGGCTGGAGTGCGGAAGAGGATCGTGGAATTCCATGTGTAGCGGTGA
+AATGCGTAGATATATGGAGGAACACCAGTGGCGAAGGCGACGGTCTGGGCCGCAACTGACGCTCATTCCCGAAAGCGTGG
+GGAGCAAATAGGATTAGATACCCTAGTAGTCCACGCCGTAAACGATGGTCACTAAGTGTCGGGGGTCAAACCCCGGTGCT
+GCAGTCAACGCAATAAGTGACCCGCCTGAGAGTTTTTTACGTCGCAAGAATGAAACTCAAAAGGATTGGACGGGGGCCC
+>206278
+TACGTATGGTGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGCAGGCGGTGCGGCAAGTCTGATAGTCGGAAAG
+CCCGGGGCCGTCAACCCCGGTACTTGACATTGGAAACTGTCGTACTAGAGTGTCGGAGGGTAAGCGGAATTCCTAGTGTA
+GCGGTGAAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCTTACTGGACGATAACTGACGCTGAGGCTCGAA
+AGCGTGGGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGAATACTAGGTGTTGGGAAGC
+>218985
+AACGTAGGGTGCGAAGCGTTGTCCGGAATTACTGGGTGTAAAGGGAGCGCAGGCGGGAAGACAAGTTGGAAGTGAAAAAC
+CATGGGCTCAACCCATGAATTGCTTTCAAAACTGTTTTTCTTGAGTAGTGCAGAGGTAGATGGAATTCCCGGTGTAGCGG
+TGGAATGCGTAGATATCGGGAGGAACACCAGTGGCGAAGGCGGTCTACTGGGCACCAACTGACGCTGA
+>240228
+CATGCAGGTGGAAGTGGACGGCTCCCGTGTGGACTGCTTCGAGCTGGTGCAGACCCGGGACGCTTCCGAGGTGGAGGACC
+ACAAGATCGTTGTGGACGGTCCCGAGATTGACGAGATCCCCGTCGGCTCCAAGATTTCTCTGAGCTACACCGTGGAAGTC
+GCCGGCAAGGCCATGCAGCCCGACTTTGGGTCCGTCATGGAGCGTAAGATGCACTCCTGGATCAACTGCATCGAGGGCGT
+TA
+>261663
+TACGTAGGTGGCAAGCGTTGTCCGGAATTACTGGGTGTAAAGGGAGCGTAGGCGGGCAGGCAAGTCAGGCGTGAAATATA
+TCGGCTCAACCGGTAACGGCGCTTGAAACTGCAGGTCTTGAGTGAAGTAGAGGTTGGCGGAATTCCTAGTGTAGCGGTGA
+AATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGACTTACTGGACTGTAACTGACGTTGAGGCACGAAAGTGTGG
+GGAGCAAACAGGATTAGATACCCTGGTAGTCCGCACCGTAAACGATGGATACTAGGTGTAGGATGTGTTAAACATTCTGT
+GCCGTCGCAAACGCAGTAAGTATCCCACCTGGGGAGTACGGCCGCAAGGTTGAAACTCAAAGGAATTGACGGGGGCCCGC
+ACAAGCAGTGGAGTATGTGGTTTAATTCGAAGCAACGCGAAGAACCTTACCAGGGCTTGACATATACAGGAATATATTAG
+AGATAGTATAGCTCTTCGGAGTCTGTATACAGGTG
+>513763
+AACGTAGGGTGCAAGCGTTGTCCGGAATTACTGGGTGTAAAGGGAGCGCAGGCGGGAAGACAAGTTGGAAGTGAAAACCA
+TGGGCTCCAACCCATGAATTGCTTTCAAAACTGTTTTTCTTGAGTAGTGCAGAGGTAGATGGAATTCCCGGTGTAGGCGG
+TGGAATGCGTAGATATCGGGAGGACACCAGTGGCGAAGGCGGTCTACTGGGCACCAACGTGACGCT
+>533078
+AACGTAGGTCACAAGCGTTGTCCGGAATTACTGGGTGTAAAGGGAGCGCAGGCGGGGAAGGCAAGTTGGAAGTGAAATCC
+ATGGGCTCAACCATGAACTGCTTTCAAAACTTGTTTTTTCTTGAGTAGTGCAGAGGTAGGCGGAATTCCCGGTGTAGCGG
+TGGAATGCGTAGATAGTCGGGAGGAACACCA
+>509572
+TACGTAGGGGGCGAGCGTTGTCCGGAATTACTGGGCGTAAAGGGTGCGTAGGCGGTTGCTTAAGTTGGATGTGAAATACC
+CGGGCTTAACTTGGGGGGTGCATTCAAGACTGGGGAACTAGAGTACAGGAGAGGGAAGCGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAGGAACACCGGTGGCGAAGGCGGCTTTCTGGACTGACACTGACGCTGAGGCACGAAAAGCGT
+GGGGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCTGTAAACGATGAATACTAGGTGTAGGGGGTATGAACTCCCT
+CTGTGCCGCAGCAAACGCAATAAGTATTCGCCTGGGAGTACGGCCGCAAGGTTGAAACTCAAAGGAATTGACGGGGGACC
+CGGC
+>521924
+TACGTAGGGGGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGCAGGCGGTCTGGCAAGTCTGATGTGAAATCCC
+GGGGCTCAACTCGGAATTGCATTGGAAACTGTCAGACTAGAGTGCCGGAGAGGTAAGTGGAATTCCTAGTGTAGCGGTGA
+AATGCGTAGATATTAGGAGGAACACCGGTGGCGAAGGCGGCTTACTGGACGGTAACTGACGCTGAGGCTCGAAAGCGTGG
+GGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGAATACTAGGTGTCGGGGCACAAAAGTGCTTCGG
+TGCCGCAGCAAACGCATTAAGTATTCCACCTGGGAGTACGTTCGCAAGAATGAAACTCAAAGGAATTGACGGGGACCCGC
+GACAAGCGGTGGAGCATGTGGTTTAATTCGAAGCAACGCGAAGAACCTTACCAAGTCTTGACATCCCGTTGACCTTGTTA
+TGTAATGTAACATCTCTTCGGAGCAACGGAGAC
+>536982
+AACGTAGGTGGCAAGCGTTGTCCGGAATTACTGGGTGTAAAGGGAGCGCAGGCGGAAGCGCAAGTTGGATGTGAAACCCA
+TGGGCTCAGCCCATGGCCTGCATCCAAAACTGTGTTTCTTGAGTAGTGCAGAGGTAGGCGGAATTCCCGGTGTAGCGGTG
+GAATGCGTAGATATCGGGAGGAACACCAGTGGCGAAGGCGGCCTACTGGGCACCAACTGACGCTGAGGCTCGAAAGCATG
+GGTAGCAAACAGGATTAGATACCCTGGTAGTCCATGCCGTAAACGATGATTACTAGGTGTTGGAGGATTGACCCCTTCAG
+TGCCGCAGTTAACACAATAAGTAATCCACCTGGGGATACGACCGCAAGGTTGAAACTCAAAGGAATTGACGGGGGCCCGC
+ACAAGCAGTGGAGTATGTGGTTTAATTCGAAGCAACGCGAAGAACCTTACCAGGTCTTGACATCGGATGCATAGTGCAGA
+GATGCATGAAGCCC
+>278709
+TACGGAGGATTCAAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGTAGGCGGTTTGATAAGTTAGAGGTGAAATCCC
+GGGGCTTAACTCCGGAACTGCCTCTAATACTGTTAGACTAGAGAGTAGTTGCGGTAGGCGGAATGTATGGTGTAGCGGTG
+AAATGCTTAGAGATCATACAGAACACCGATTGCGAAGGCAGCTTACCAAACTATATCTGACGTTGAGGCACGAAAGCGTG
+GGGAGCAAATAGGATTAGATACCCTAGTAGTCCACGCCGTAAACGATGGTCACTAAGTGTCGGGGTCAAACCCCGGTGCT
+GCAGTCAACGCAATAAGTGACCCGCCTGAGTAGTACGTTCGCAAGAATGAAACTCAAAGGAATGGACGGGGCCCGCACAA
+GCGGTGGAGCATGTGGTTTAATTCGAAGCAACGCGAAGAACCTTACCAGGTCTTTGACATCGATCGTAAAAAGGGATGGA
+GACATCCTCA
+>304270
+TACGTAGGTGGCAAGCGTTGTCCGGATTTACTGGGTGTAAAGGGCGCGTAGGCGGGATGGCAAGTCAGATGTGAAATCCA
+TGGGCTCAACCCATGAACTGCATTTGAAACTGTCGTTCTTGAGTATCGGAGAGGCAAGCGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCTTGCTGGACGACAACTGACGCTGAGGCGCGAAAGCGTG
+GGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGAATACTAGGTGTGGGGGGACTGGACCCCCTCC
+GTGCCGCAGTTAACACAATAA
+>262115
+TACGTAGGTGGCGAGCGTTATCCGGAATTATTGGGCGTAAAGAGGGAGCAGGCGGCACTAAGGGTCTGTGGTGAAAGATC
+GAAGCTTAACTTCGGTAAGCCATGGAAACCGTAGAGCTAGAGTGTGTGAGAGGATCGTGGAATTCCATGTGTAGCGGTGA
+AATGCGTAGATATATGGAGGAACACCAGTGGCGAAGGCGACGATCTGGCGCATAACTGACGCTCAGTCCCGAAAGCGTGG
+GGAGCAAATAGGGATTAGATACCCTAGTAGTCCACGCCGTAAACGATGAGTACTAAGTGTTGGGAGTCAAATCTCAGTGC
+TGCAGTTAACGCAATAAGTACTCCGCCTGAGTAGTACGTTCGCAAGAATGAAACTCAAACGGAATTAGACGGGGGCCCGC
+GACAAGCGGTGGAGCATGTGGTTTAATTCGAAG
+>68350
+TACGTATGGAGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGTGTAGGTGGCCATGCAAGTCAGAAGTGAAAATCC
+GGGGCTCAACCCCGGAACTGCTTTTGAAACTGTAAGGCTAGAGTGCAGGAGGGGTGAGTGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCTCACTGGACTGTAACTGACACTGAGGCTCGAAAGCGTG
+GGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGAATACTAACTGTTTGCGATACAATGTAAGCGG
+TACAGCGAAAAGCGTTTAAGTACTTCCACCGTGGGGAGTACGCCGGCAACGGTGAAACTCAAAGGAATTGACGGGGGCCC
+GCACAAGCGGAGGAACATGTGGTTTAATTCGATGATACGCG
+>112263
+TACGTAGGGGGCTGGCGTTATCCGGAATTACTGGGCGTAAAGGGTGCGTAGGTGGTTTCTTAAGTCAGAGGTGAAAGGCT
+ACGGCTCAACCGTAGTAAGCCTTTGAAACTGGGAAACTTGAGTGCAGGAGAGGAGAGTGGAATTCCTAGTGTAGCGGTGA
+AATGCGTAGATATTAGGAGGAACACCAGTTGCGAAGGCGGCTCTCTGGACTGTAACTGACACTGAGGCACGAAAGCGTGG
+GGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGAGTACTAGCCGTCGGAGGTTACCCCCTTCGGTG
+GCGCAGCTAACGCATTAAGTACTCCGCCTGGGAAGTACGCTCGCAAGAGTGAAACTCAAAGGAATTGACGGGGACCCGCA
+CAAGTAGCGGAGCATGTGGTTTAATTCGAAGCAACGCGAAGAACCCTTACCTAAGCTTGACATCCTTTTGACCGATGCCT
+AATCG
+>215890
+TACGGAGGATCCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGAGCGTAGATGGATGTTTAAGTCAGTTGTGAAAGTTT
+GCGGCTCAACCGTAAAATTGCAGTTGATACTGGATATCTTGAGTGCAGTTGAGGCAGGCGGAATTCGTGGTGTAGCGGTG
+AAATGCTTAGATATCACGAAGAACTCCGATTGCGAAGGCAGCCTATAAGCTGCAACTGAC
+>157625
+TACGTAGGGAGCAAGCGTTATCCGGATTTATTGGGTGTAAAGGGTGCGTAGACGGGAGAACAAGTTAGTTGTGAAAGCCC
+TCGGCTTAACTGAGGAACTGCAACTAAAACTATTTTTCTTGAGTTGCAGGAGAGGAAAGCGGAATTCCTAGTGTAGCGGT
+GAAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCTTTCTGGACTGTAACTGACGTTGAGGCACGAAAGTGT
+GGGGAGCAAACAGGATTAGATACCCTGGTAGTCCACACCGTAAACGATGGATACTAGGTGTAGGAGATGGATTTCATCAT
+CCTGTGCCGAAAGCAAACGC
+>560842
+ATACGGAGGATCCGAGCGTTATCCGGATTTAGTTGGGTTTAAAGGGAGCGTAGGCGGATTGTTAAGTCAGTTGTGAAAGT
+TTGCGGCTCAACCCGTAAAATTGCAGTTGATAACTGGTCAGTCTTGAGTGCAGTAGAGGTGGGCGGAATTCGTGGTGTAG
+CGGTGAAATGCTTAGATATCACGAAGAACTCCGATTGCGAAGGCAGCTCAC
+>591496
+TACGTAGGGGGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGTAGACGGCGCAGCAAGTCTGATGTGAAAGGCA
+GGGGCTTAACCCCTGGACTGCATTGGAAACTGCTGTGCTTGAGTGCCGGAGGGGTAAGCGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCTTACTGGACGGTAACTGACGTTGAGGCTCGAAAGCGTG
+GGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGAATACTAGGTGTCGGGCACAAAAGTGCTTCGG
+TGCCGCAGCAAACGCATTAAGTATTCCACCTGGGGAGTACGTTCGCAAGAATGAAACTCAAAGGAATTGGACGGGGACCC
+GCAACAAGCGGTGGAGCATGTAGGTTTAATTCGAAGCAACGCGAAGAACCTTACCAGTCCTTGACA
+>167215
+TACGTAGGGGGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGTAGGTGGTATGGCAAGTCAGAGGTGAAAACCC
+AGGGCTTAACCTTGGGATTGCCTTTGAAACTGTCAGACTAGAGTGCAGGAGGGGTAAGTGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCTTACTGGACTGTAACTGACACTGAGGCTCGAAAGCGTG
+GGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGAATACTAGGTGTTGGGGAGCGAAGCTCCTCGG
+TACCGCAGTTAACGCAATAAGTATTCCACCTGGGGAGTACGTTCGCAAGAATGAAACTCAAAGGAATTGACGGGACCCGC
+ACAAGCGGAGGAACATGTGGTTTAATTCGATGATACGCGAGGAACCTTACCCGGGCTTAAATTGCATTTGAATATATTGG
+AAACAGTAATAGCCGTAAGGCAAATGT
+>185339
+AACGTAGGTCACAAGCGTTGTCCGGAATTACTGGGTGTAAAGGGAGCGCAGGCGGGAAGACAAGTTGGAAGTGAAATCCA
+TGGGCTCAACCCATGAACTGCTTTCAAAACTTGTTTTTCTTGAGTAGTGCAGAGGTAGGCGAATTCCCGGTGTAGCGGTG
+GAATGCGTAGATATCGGGAGAACACCAGTGGCGAAGGCGCCTACGTGGGCACCAACTGACGCTGAGGCT
+>570930
+TACGTAGGTGGCGAGCGTTATCCGGATTTATTGGGACGTAAAGCGTCCGCAGCCGGTTTATTAAGTCTAGAATTAAAGCC
+TGGAGCTCAACTACCAGTTCGTTTTAGAAACTGATAGACTCGAGTGTGGTAGAGGCAAACGGAATTTCTAGTGTAGCGGT
+AGAATGCGTAGATATTAGAAGGAACACCAGTGGCGAAGGCGGTTTGCTAGGCCACCACTGACGGTCATGGACGAAAGCGT
+GGGGAGCAAATAGGATTAGATACCCTAGTAGTCCACGCTGTAAACGATGAGTACTAAGTGTCGGGTTAACCGGTGCTGAA
+GTTAACACATTAAGTACTCCGCCTGAGTAGTACGGTCGCAAGGCTGAAACTCAAAGGAATTGACGGGGACCCGCACAAGC
+GGTGGAGCAATGTGGTTTAATTCGAAGCAACGCGAAGAACCTTACCAAATCTTGACATCCCTCTGACCCGGGACTTAACC
+CGTCCCTTTCTTTCGGGACAGAGGAGACAGGTGGTGCA
+>289855
+AACGTAGGGTGCAAGCGTTGTCCGGAATTACTGGGTGTAAAGGGAGCGCAGGCGGGAAGACAAGTTGGAAGTGAAAACCA
+TGGGCTCAACCCATGAATTGCTTTCAAAACTGTTTTTCTTGAGTAGTGCAGAGGTAGATGGGAATTCCCGGTGTAGCGGT
+GGAATGCGTAGATATCGGGAGGAACACCAGTGGCGAAAGGCGGTCTACTGGGCACCAACTGACGCTG
+>313089
+TACGTAGGGGGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGCAGGCGGCATGATAAGTCTGATGTGAAAACCC
+AAGGCTCAACCATGGGACTGCATTGGAAACTGTCGTGCTGGAGTGTCGGAGAGGTAAGCGGAATTCCTAGTGTAGCGGTG
+AGATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCTTACTGGACGATGACTGACGCTGAGGCTCGAAAGCGTG
+GGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCTGTAAACGATGAATACTAGGTGTGCGGGACTGACCCCCTGCGT
+GCCGCAGCTAACGCAATAAGTATTCCACCTGGGGAGTACGATCGCGGTAGGTAACGTACAACGAAGTTAGTACGGGGACC
+GGGCCGACAACGACGGTCGGGTAGTT
+>50120
+TACGTAGGTTGCAAGCGTTGTCCGGATTTACTGGGTGTAAAGGGCGTGTAGGCGGAGATGCAAGTTAGGAGTGAAATCTA
+TGGGCTCAACCCATAAACTGCTTCTAAAACTGTATCCCTTGAGTATCGGAGAGGCAAGCGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCTTGCTGGACGACAACTGACGCTGAGGCGCGAAAGCGTG
+GGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCTGTAAACGATGAATACTAGGTGTGGGAGGACTGACCCCTTCCG
+TGCCGCAGTTAACACAATAAGTATTCCACCTGGGGAGTACGCCGGCAACGGTGAAACTCAAAGGAATTGACGGGGCCGCA
+CAAGCGGAGGAACATGTGGTTTAATTCGATGATACGCGAGGAACCTTACCCGGGCTCAAACGCAGGGGAATGTCGGTGAA
+AGCCGG
+>578409
+TACGTAGGTGGCAAGCGTTGTCCGGATTTACTGGGTGTAAAGGGCGTGCAGCCGGGCATGCAAGTCAGATGTGAAATCTC
+AGGGCTTAACCCTGAAACTGCATTTGAAACTGTATGTCTTGAGTGCCGGAGAGGTAATCGGAATTCCTTGTGTAGCGGTG
+AAATGCGTAGATATAAGGAAGAACACCAGTGGCGAAGGCGGATTACTGGACGGTAACTGACGGTGACGGCGCGAAAGGCG
+TGGGGAGCGAACAGGGATTACGATACCCTGGTAGTCCAACGCTGTAAACGATGGATACGTAGGTCGTGCGGGGACTGACC
+CCCTGCGTGACCGCAGTTAA
+>69664
+TACGGAGGATCCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGAGCGTAGGTGGATTGTTAAGTCAGTTGTGAAAGTTT
+GCGGCTCAACCGTAAAATTGCAGTTGAAACTGGCAGTCTTGAGTACAGTAGAGGTGGGCGGAATTCGTGGTGTAGCGGTG
+AAATGCTTAGATATCACGAAGAACTCCGATTGCGAAGGCAGCTCACTAGACTGTCACTGACACTGATGCTCGAAAGTGTG
+GGTACTACAAACGAGGTATTAGACT
+>109792
+TACGTAGGTGGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGCGTGTAGGCGGGATGGTAAGTCAGATGTGAAAACCA
+TGGGCTCAACCCATGGCCTGCATTTGAAACTGCTGTTCTTGAGTGATGGAGAGGCAGGCGGAATTCCGTGTGTAGCGGTG
+AAATGCGTAGATATACGGAGGAACACCAGTGGCGAAGGCGGCCTGCTGGACATTAACTGACGCTGAGGCGCGAAAGCGTG
+GGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCTGTAAACGATGGATACTAGGTGTGGGGGGTCTGACCCCCTCCG
+TGCCGCAGTTACACAATAAGTATCCCACCTGGGGAGTACGATCGCAAGGTTGAAACTCAAAGGAATTGGACGGGGCCGCA
+CAAGCGGTGGAGTA
+>526490
+ATACGTAGGGAGCAAGCGTTATCCGGATTTATTGGGTAGTAAAGGGTGCGTAGACGGGAAATTAAGTTAGTTGTGAAATC
+CCTCGGCTCAACTGAGGAACTGCAACTAAAAACTGGTTTTCTTGAGTGCAGGAGAGGTAAGTGGAATTCCTAGTGTAGCG
+GTGAAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGACTTACTGGACTGTAACTGACGTTGAGGCACGAAAGT
+GTGGGGAGCAAACAGGATTAGATACCCTGGTAGTCCACACCGTAAACGATGGATACTAGGTGTAGGGTGTATTAAGCTAT
+TCTGTGCCGTCGCAAAACGCAAGTAAGTACTCCCACCGTGGGGA
+>510817
+TACGTAGGGGGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGTAGACGGCTTTGCAAGTCTGATGTGAAAGGCG
+GGGGCTCAACCCCTGGACTGCATTGGAAACTGTGGGGCTTGAGTGCCGGAGAGGTAAGCGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCTTACTGGACGGTAACTGACGTTGAGGCTCGAAAGCGTG
+GGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCGGTAAACGATGAATACTAGGTGTTGGGTGTCACAGACATTCGG
+TGCCGCAGCAAACGCAATAAGTATTCCACCTGGGGAGTACGTTCGCAAGAATGAAACTCAAAGGAATTGACGGGGACCCG
+CACAAGCGGTGGAGCATGTGGTTTAATTCGAAGCAACGCGAAGAACCTTACCAAGTCTTGACATCTCCCTGACAGAGTAT
+GTAATGTACTTTTCCTT
+>538805
+AACGTAGGTCACAAGCGTTGTCCGGAATTACTGGGTGTAAAGGGAGCGCAGGCGGGAGAACAAGTTGGAAGTGAAATCCA
+TGGGCTCAACCCATGAACTGCTTTACAAAACTTGTTTTTCTTGAGTAGTGCAGAGGTAGGCGGAATTCCCGGTGTAGCGG
+TGGAATGCGTAGATATCGGGAGGAACACCAGTGGCGAAGGCGGCCTACTGGGCACCAACTGACGCTGAGGCTACGAAAGT
+GGTGGGTAGACAAACAGGATTAGACTACCC
+>334839
+TACGGAAGGTCCGGGCGTTATCCGGATTTATTGGGTTTAAAGGGAGCGTAGGCCGGAGATTAAGCGTGTTGTGAAATGTA
+GTGGCTCAACCTCTGCACTGCAGCGCGAACTGGTCTTCTTGAGTACGCACAACGTGGGCGGAATTCGTGGTGTAGCGGTG
+AAATGCTTAGATATCACGAAGAACTCCGATTGCGAAGGCAGCTCACGGGAGCGCAACTGACGCTGAAGCTCGAAAGTGCG
+GGTATCGAACAGGATTAGATACCCTGGTAGTCCGCACGGTAAGCGAGTGGATCGCCCGCGTGGTTCGGCCTAGAAGTACG
+GTACAGGCCGGACCAAGGAC
+>273850
+TACGTAGGTGGCAAGCGTTGTCCGGATTTACTGGGTGTAAAGGGCGTGTAGCCGGGAAGGCAAGTCAGATGTGAAATCCA
+CGGGCTCAACTCGTGAACTGCATTTGAAACTGTTTTTCTTGAGTATCGGAGAGGCAATCGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAGGAACATCAGTGGCGAAGGCGGCTTACTGGGCTTTAACTGACGCTGAGGCTCGAAAGCGTG
+GGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGATTACTAGGTGTGGGGGGACTGACCCCTTCCGT
+GCCGCAGCAAACGCAATAAGTAATCCACCTGGGGAGTACGACCGCAAGGTTGAAACTCAAAGGAATTGACGGGGGCCCGC
+ACAAGCAGTGGAGTATGTGGATTTAATTCGAAGCAACGCGAAGAACCTTACCAGGTCTTAGACATCGTATGCATACCTCA
+GAGATAGAGTAGAAATCT
+>16072
+TCCGTCCATTGCATGAATCTGGAAATAATATCGGTCGCTTCACTGGGGGGATTGATATCATAAAGAAAGTCTCGGATGCC
+ATCTATAATCACCAGTCCGAGATCGGGGATTGTGCCGATAGCCTGTTCGACTATAGCCAGACGTATCGGAGGGGCATACT
+TGCGCAAAGCCAGCATAATCAAATTATCTGGATTCTTGTCCTCCGGGAGTCCGGCAAGGCGCAGGATTCTTTTCAATACC
+TTCTGGCAATGATGGCGTCCCTGTTCTGTATCAATGTACAGAACAGTTCGCTTATCCTCTGGAAATGATGAACGGTATTT
+CAGCACTGTTCCGTTTTTCTAATGCTGACGCTGCAATGGCTGTAACGTTAAAAGTTTTCTTACTTTTGGCTTTGCCTATA
+GAGGCACTGAAGTTCCCCAGTGTACCGATTGCGGCATCATCTACCATCAATACAACCGGT
+>183985
+TACGGAGGGTGCAAGCGTTGTCCGGAATCATTGGGCGTAAAGAGTTCGTAGGCGGCATGTAAAGTCAGGTGTTAAAGGCT
+GAGGCTCAACCTCAGTATGGCACTTGATACTTGCAAGCTAGAATGCGGTAGAGGTAAAGGGAATTCCAGGTGTAGCGGTG
+GAATGCGTAGATATCTGGAAGAACACCAAAGGCGAAGGCAGTCTCCTGGGCAAATCTGACGCTCATATGCGAAAGCGTGG
+GTAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGTCAATTAGGAGCTTGGGCAATAGTCTGGGTTCC
+GCAGCTAACGCAATAAA
+>214380
+TACGGAAGGGTCCGGGACGTTATCCGGATTTATTGGGTTTAAAGGGAGCGTAGGCCGAAGATTAGGCGTGTTGTGAAATG
+TAGACGCTCAACGTCTGCACTGCAGCGCGAACTGGTTTCCTTGAGTACGCACAAAGTGGGCGGAATTCGTGGTGTAGCGG
+TGAAATGCTTAGATATCACGAAGAACTCCGATTGCGAAGGCAGCTCACTGGAGCGCAACTGACGCTGAAGGCTCGAAAGT
+GCGGGTATCGAACAGGA
+>262658
+TACGTAGGGGGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGTAGACGGCATGGCAAGTCTGAAGTGAAATGCG
+GGGGCTCAACCCCTGAACTGCTTTGGAAACTGTCAGGCTGGAGTGCAGGAGAGGTAAGTGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCTTACTGGACTGTAACTGACGTTGAGGCTCGAAAGCGTG
+GGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGATTACTAGGTGTTGGGGATTTATAAATCCCGG
+TGCCGTCGCAAACGCAATAAGTAATCCACCTGGGGAGTACGTTCGCAAGAATGAAACTCAAAGGAATTGACGGGGACCCG
+CACAAGCGGTGGAGCATGTGGTTTAATTCGAAGCAACGCGAAGAACCTTACCTGGTCTTGACATCCCGATGCATAACGGG
+TAATGCCGTTCGTACTTCGGTACATTGGAGACAGGTGGTGC
+>197095
+TACGTATGGTGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGGCGTAGGTGGCAAGGCAAGCCAGAAGTGAAACCCGG
+GGACTCAACCGCGGGATTGCTTTTGGAACTGTCATGCTAGAGTGCAGGAGGGGTGAGCGGAATTCCTAGTGTAGCGGTGA
+AATGCGTAGATATTAGGAGGAACACCGGAGGCGAAGGCGGCTCACTGGACTGTAACTGACACTGAGGCTCGAAAGCGTGG
+GGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCGGTAAACGATGAATACTAGATGTCGGTAGCAAAGCTACTCGGTG
+TCGTCGCAAAACGCAATAAGTATTCCACCTGGGGAGTACGTTCGCAAGAATGAAACTCAAAGGAATTGACGGGACCGCAC
+AAGCGGTGGAGCATGTGGTTTAATTCGAAGCAACGCGAAGAACCTTA
+>296377
+TACGTAGGTGGCGAGCGTTGTCCGGAATTACTGGGCGTAAAGGGCGTGTAGGCGGATATTTAAGTCAGATGTGAAAACCC
+CGGGCTCAACTTGGGGACTGCATTTGATACTGGATATCTTGAGGACAGGAGAGGAAAGCGGAATTCCTAGTGTAGCGGTG
+AAGTGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCTTTCTGGACTGTAACTGACGCTGAGGCGCGAAAGCGTG
+GGAGCGAACGGGATTAGATACCCCGGTAGTCCACGCTGTAAACGATGGGTACTAGGTGTAGGAGGTATCGACCCTTCTGT
+GCCGGAGTTAACGCAATAAGTACCCGCCTGGGGAGTACGGCCGCAAGGTTGAAACTCAAAGGAATTGACGGGGACCCGCA
+CAAGCGGTGGAGCATGTGGTTTAATTCGAAGCAACGCGAAGAACCTTACCAGGGCTTGACATCTTCCGAAAAGCATAGAG
+ATATGTAATGTGTCCCTTCGGGGATAACGGAAAGACAGGTGGTGCA
+>25695
+TACGTAGGGGGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGTAGACGGTTTTGCAAGTCTGAAGTGAAAGCCC
+GGGGCTTAACCCCGGGACTGCTTTGGAAACTGTAGGACTAGAGTGCAGGAGAGGTAAGTGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCTTACTGGACTGTAACTGACGTTGAGGCT
+>556462
+GCACCACTTCACGTTGAGCGATAGGGATAACTTTCTCCTCCGCTTCCGGACGCTGCGGCTGTCGGGGCAGCTTTGTGCTA
+CGGACAGGATTGATTGGTGCCAGAGACATTTCTACGGCATAGTCGTACAGCTGGATCAGGATACTGCGCACCGAAGAAGT
+GGTGCGACGAGAGAGCTGCTTTTCCGCTTGCAGATAATAGAGGAACGTCTGGATGCGGGAGGCTGTCAACTCGGCCACAG
+GGATGTCTCCCAACGCGGGAACGATATGTAACCGTTCGCAGGAATAATACAGCTCCATGGTGCGGCTGCAAACCTCATAG
+GTTTTGAACGTGGTGAGCCAGCGGTGCAGAAAGTCCTTCAGCAGCAATTCAGATGCCAGCGAAGGGGCGGCCTTGGATGT
+TTCGGAAACCAGCGTCAGGGCGATCCAGTGCGCCGCTTCCTCA
+>350731
+TACGTAGGTGGCAAGCGTTGTCCGGATTTACTGGGTGTAAAGGGCGTGCAGCCGGGTCTGCAAGTCAGATGTGAAATCCC
+GATGGGCCTCAACCCATGAACTGCATTTGAAACTGTAGATCGTTGAGTGTTCGGAGGGGCAATCGGAATTACCTAGTGTA
+GCGGTGAAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGATTGCTGGACGATAACTGACGGTGA
+>590547
+TACGGAAGGTCCGGGCGTTATCCGGATTTATTGGGTTTAAAGGGAGCGTAGGCCGTGAGGTAAGCGTGTTGTGAAATGTA
+GGCGCCCAACGTCTGCACTGCAGCGCGAACTGCCCCACTTGAGTGCGCGCAACGCCGGCGGAACTCGTCGTGTAGCGGTG
+AAATGCTTAGATATGACGAA
+>322112
+TACGTAGGTGGGCGAGCGTTGTCCGGATTTACTGGGCGTAAAGGGAGCGTAGGCGGATTTTTAAGTGAGATGTGAAATAC
+TCGGGCTTAACCCGTGAGTCGCTGCTATTTCAAACTGGAAGTCTAGAGTGCAGGAGAGGAGAAGGGAATTCCTAGTGTAG
+CGGTGAAATGCGTAGAGATTAGGAAGAACACCAGTGGCGAAGGCGCTTCTCTGGACTGTAACTGACGCTGAGACTCGAAA
+GCGTGGGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGAATACTAGGTGTAGGGGTTGTCAA
+>36624
+GATACTGACACAGGTCATTGCTGCCGATGGAGGCAAAATCCACCTGCTTTGCCAGCTGGTCCGCCATCATCACCGCCGCC
+GGGGTTTCTATCATAACGCCAAGCTTAATATGATCACTCACTGCCACACCGCTTTGCCACAGGCGCTCTTTTTCTTCCAG
+TACAATTATACGCGCTTTTGTAAAGTCTTCAAGGCCAGAGACCATGGGGAACATCAGCCATAGCGTTC
+>350732
+TACGTATGGTGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGTAGACGGAGTGGCAAGTCTGATGTGAAAACCC
+GGGGCTCAACCCCGGGACTGCATTGGAAACTGTCAATCTAGAGTACCGGAGAGGTAAGCGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCTTACTGGACGGTAACTGACGTTGAGGCTCGAAAGCGTG
+GGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGACTACTAGGTGTCGGGCAGCAAAGCTGTTCGG
+TGCCGCAGCAAACGCAATAAGTAGTCCACCTGGGGAGTACGTTCGCAAGAATGAAACTCAAAGGAATTGACGGGGACCCG
+CACAAGCGGTGGAGCATGTGGTTTAATTCGAAGCAACGCGAAGAACCTTACCTGCTCTTGACATCTCCCTGACCGGCAAG
+TAATGTTGCCTTTTCCTTCGGGACA
+>193230
+TACGTATGGTGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGCAGGCGGTGCGGCAAGTCTGATGTGAAACGGC
+CCGGGCTCCGAACCCCGGTACTGTCAATTGGAAACTGTCGTACTAGAGTGTCGGAGGGTAAGCGGAATTCCTAGTGTAGC
+GGTGAATGCGTAGATATTAGGAGGAACACC
+>336539
+TACGTAGGGGGCGAGCGTTGTCCGGAATCACTGGGCGTAAAGGGCGCGTAGGCGGTTTGTTAAGTCAGATGTGAAAGGTG
+AGGGCTCAACCCTTAGAATGCATCTAATACTGGCAGACTTGAGTACAGAAGAGGAAAGCGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCTTTCTGGGCTGAAACTGACGCTGAGGCGCGAAAGCGTG
+GGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGGGTACTAGGTGTCGGGAGTATCGATACTTTCG
+GTGCCGGAGTAAACACAATAACGTACCCCGCCGTGGGGAGTACGATCGCAAGATTGAAACTC
+>192899
+TACGTAGGTTGCAAGCGTTGTCCGGATTTACTGGGTGTGAAGGGCGTGTAGGCGGAGATGCAAGTTGGGAGTGAAATCCA
+TGGGCTCAACCCATGAACTGCTCTCAAAACTGTATCCCTTGAGTATCGGAGAGGCAAGCGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCTTGCTGGACGACAACTGACGCTGAGGCGCGAAAGCGTG
+GGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCTGTAAACGATGAATACTAGGTGTGGGAGGACTGACCCCTTCCG
+TGCCGCAGTTAACACAATAAGTATTCCACCTGGGGATACGACCGCAAGGTTGAAACTCAAAGGAATTGACGGGGGCCGCA
+CGAAGCAGTGGATTATGTGGTTTAATTCGACGTCAACGCGAAGACCTTACAGGACTTGACATCCAACTAACGAAGTAGAG
+ATACATCAGGTGCCCTT
+>182728
+TACGTAGGGAGCGAGCGTTATCCGGATTTATTGGGTGTAAAGGGTGCGTAGACGGGAAGTCAAGTTAGTTGTGAAATCCC
+TCGGCTTAACTGAGGAACTGCAACTAAAACTGATTTTCTTGAGTACTGGAGAGGAAAGTGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAGGAACACCGGTGGCGAAGGCGACTTTCTGGACAGAAACTGACGTTGAGGCACGAAAGTGTG
+GGGAGCAAACAGGATTAGATACCCTGGTAGTCCACACTGTAAACGATGAATGCTAGGTGTAGGGGGTATCGACCCCTTCT
+GTGCCGCAGTCAACACAATAAGCATTCCGCCTGGGGAGTACGGCCGCAAGGTTGAAACTCAAAGGAATTGACGGGGGCCC
+GCACAAGCAGCGGAGCATGTGGTTTAATTCGACGCAACGCGAAGACCTTACCAGGTCTTGACATCCACTTAAACTTACAG
+AGATGTAAGGTGTGCTTGCACAAAGTGAGACAGGTGGT
+>535209
+TACGTAGGGGGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGTGTAGGTGGCTGTGCAAGTCAGGAGTGAAAGCCC
+GGGGCTCAACCCCGGGACTGCTCTTGAAACTGTGCGGCTTGAGTGCAGGAGGGGCAGGCGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAGGAACACCGGTGGCGAAGGCGGCCTGCTGGACTGTAACTGACACTGAGGCTCGAAAGCGTG
+GGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGAATACTAGGTGTCGGGGCTCATAAGAGCTTCGG
+TGCCGCAGCAAACGCAATAAGTATTCCACCTGGGGAGTACGTTCGCAAGAATGAAACTCAAAGGAATTGACGGGGGCCCG
+CACAAGCAGCGGAGCATGTGGTTAATTCGACGCAACGCGAAGAACCTTACCAGGTCTTGACATCCACTTAAACTTACAGA
+GATGTAAGGTGTGCTTGC
+>36647
+TACGTAGGGGGGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGTAGACGGTGTGGCAAGTCTGATGTGAAAGGC
+ATGGGCTCAACCCCGGTACTGCATTGGAAACTGTCGTACTAGAGTGTCGGAGGGTAAGCGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCTTACTGGACGATAACTGACGCTGAGGCT
+>305437
+AACGTAGGGTGCAAGCGTTGTCCGGAATTACTGGGTGTAAAGGGAGCGCAGGCGGACTGGCAAGTTGGAAGTGAAAACTA
+TGGGCTCAACCCATAAATTGCTTTCAAAACTGCTGGCCTTGAGTAGTGCAGAGGTAGGTGGAATTCCCGGTGTAGCGGTG
+GAATGCGTAGATATCGGGAGGAACACC
+>541602
+CGCCGCATCGGGGAGATATTCTCTGAGGCCTCGGGCTATACGCTTGAGGAGACGCCGTTTGCCTCAGGCTTTGCCGGGTA
+CAAGGACTGGTTCATCGAGAGCTTTGACCGCCCCGGCTACACGATCGAAGCCGGGCTCGGCACAAATCCCCTGCCGGCGT
+ATCAGTTCCCGGATATCTATGAGCGCTGCCTCGGAATCCTCGTCTACGGCGCACTTGTCACCTGATAATAGAACTCCCCG
+GCGGGTCATCCTCCGGGGAGTTGGAATTTTATGGAGCTCAGGCAGGCTCCGCGCCCCATGAATATCGACGCGCGCGGCGC
+GCCGCCCGTCGCGGAGAGGACGCAAAGGCCGACACGACGAGGATGATCGGCA
+>329313
+TACGGAGGATCCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGAGCGTAGGCGGACGCTTAAGTCAGTTGTGAAAGTTT
+GCGGCTCAACCGTAAAATTGCAGTTGATACTGGGTGTCTTTGAGTACAGTAGAGGCAGGCGGAATTCGTGGTGTAGCGGT
+GAAATGCTTAGATATCACGAAGAACTCCGATTGCGAAGGCAGCTTGCTGGACTGTAACTGACGCTGGTGACTCGAAAGGT
+GTGGGTAATCG
+>193343
+TACGTAGGGGGCGAGCGTTGTCCGGAATTACTGGGCGTAAAGGGCGCGTAGGCGGCATATTAAGTTAGATGTGAAATTCC
+CGGGCTTAACCTGGGCGTTGCATTTAAAACTGATAAGCTTGAGTGCCGGAGAGGAAAGCGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAGGAACATCAGTGGCGAAGGCGGCTTTCTGGACGGTAACTGACGCTGAGGCGCGAAAGCGTG
+GGTAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGAATACTAGGTGTAGGTGGTATCGACTCCATCT
+GTGCCGCAGCAAACGCAATAAGTATTCCGCCTGAGGAGTACGGCCGCAAGGTTGAAACTCAAAGGAATTGGACGGGGGCC
+CGCACAAGCAGCGGAGCATGTGGTTTAATTCGAAGCAACGCGAAGAACCTTACC
+>333279
+TACGTAGGTGGCAAGCGTTGTCCGGATTTACTGGGTGTAAAGGGCGTGCAGCCGGGAATGCAAGTCAGATGTGAAATCCA
+TGGGCTTAACCCATGAACTGCATTTGAAACTGTATTTCTTGAGTACTGGAGAGGCAATCGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGATTGCTGGACAGCAACTGACGGTGAGGCGCGAAAGTGTG
+GGGAGCAAACAGGATTAGATACCCTGGTAGTCCACACTGTAAACGATGACTACTAGGTGTGCGGGGACTGACCCCCTGCG
+TGCCGCAGTTACACAAT
+>291764
+TACGTAGGTGGCGAGCGTTATCCGGAATCATTGGGCGTAAAGAGGGAGCAGGCGGCCGCAAAGGGTCTGTGGTGAAAGAC
+CGAAGCTAAACTTCGGTAAGCCATGGAAACCGGGCGGCTGGAGTGCGGAAGAGGATCGTGGAATTCCATGTGTAGCGGTG
+AAATGCGTAGATATATGGAGGAACACCCAGTGGCGAAGGCGACGGTCTGGGCCGCAACTGACGCTCATTCCCGAAAGCGT
+GGGGAGCAAAGTAGGATTAGATACCCTAGTAGTCCACGCCGTAAACGATGGTCACTAAGTGTCGGGGGTCCAAACCCCGG
+TGCTGCAGTCAACGCAATAGTGACCCGACCTGAGTAG
+>192952
+TACGTATGGTGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGCAGGCGGTCTGGCAAGTCCGATGTGAAAATCC
+GGGGCTCAACTCCGGAACTGCATTGGAAACTGTCAGACTAGAGTGTCGGAGAGGTAAGTGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGAGATTGGGAGGAACACCAGTGGCGAAGGCGACTTACTGGGCCGTAACTGACGCTGAGGCGCGAAAGCGTG
+GGAGCGAACAGGATTAGATACCCTGGTAGTCCACGCTGTAAACGATGAATGCTAGGTGTAGGGGGTATCGACCCCTCCTG
+TGCCGGAGTAAACGCAATAAGAATTCCGCCTGGGGAGTACGGCCGCAAGGTTGAAACTCAAAGGATTGACGGGGGCCCGC
+ACAAGCAGCGGAGCATGTTGTTTAATCGATGCAACGCGAAGAACCTTACCA
+>136044
+TACGTAGGTGGCAAGCGTTATCCGGAATTACTGGGTGTAAAGGGTGTGTAGGCGGGTTTGCAAGTCAGATGTGAAAATTA
+TGGGCTCAACCCATAACCTGCGTCTGAAACTACAGATCTTGAGAGTGGGAGAGGTAAATGGAATTCCCGGTGTAGCGGTG
+AAATGCGTAGATATCGGGAGGAACACCAGTGGCGAAGGCGATTTACTGGACCACAACTGACGCTGAGACACGAAAGCGTG
+GGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCTGTAAACGATGAATACTAGGTGTGGGAGGACTGACCCCTTCCGT
+GCCGCAGTTAACACAATAAGTATTCCACCTGGGGATACGATCGCAAGATTGAAACTCAAAGGATTGACGGGGCCCGCACA
+AGCAGTGGATTATGTGGTTTAATTCGACGCAACGCGAAGAACCTTACCGGGATTTGACATCCTACTAACGAAGTAGAGAT
+ACATTA
+>276703
+TACGGAGGATCCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGAGCGTAGGCGGACGCTTAAGTCAGTTGTGAAAGTTT
+GCGGCTCAACCGTAAAATTGCAGTTGATACTGGGTGTCTTGAGTGCAGTTGAGGCAGGCGGAATTCGTGGTGTAGCGGTG
+AAATGCTTAGATATCACGAAGAACTCCGATTGCGAAGGCAGCCTGCTAAGCTGCAACTGACATTGAGGCTCGAAAGTGTG
+GGTATCAAACAGGATTAGATACCCTGGTAGTCCACACGGTAAACGATGAATACTCGCTGTTTGCGATATACGGCAAGCGG
+CCAAGCGAAAGCGTTAAGTATTCCACCTGGGGAGTACGCCCGGCAACGGTGAAACTCAAAGGAATTGACGGGGCCCGCAC
+AGCGGAGGACATGTGGTTAATTCGATGATACGCGAGGAACCTTACCGGGCTTAAATGCACTCGAATGATCCGGAACGGTT
+CAGCTAGCAATAGCGAGTGTGAAGTG
+>237324
+TACGTAGGGGGCAAGCGTTATCCGGAATTACTGGGTGTAAAGGGTGCGTAGGTGGCCAGGCAAGTCAGAAGTGAAAGCCC
+GGGGCTTAACTCCGGGACTGCTTTTGAAACTGTCAGGCTAGAGTGCAGGAGAGGTAAGCGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAGGAACATCAGTGGCGAAGGCGGCTTACTGGACTGAAACTGACACTGAGGCACGAAAGCGTG
+GGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGAATACTAGGTGTCGGGCCGTAGAGGCTTCGGT
+GCCGCAGCCAACGCAATAAGTATTCCACCTGGGGAGTACGTTCGCAAGAATGAAACTCAAAGGAATTGACGGGGCCCGCA
+CAAGCGGAGGAACATGTGGTTTAATTCGATGATACGCGAGGAACCGTTACCGGGCTTAAATTGCAGACGAATTACGAGGT
+AAACTTGTAAGCCGCAAGGCGTCTGTGAAGGTGCTG
+>326644
+TACGGAAGGTCCGGGCGTTATCCGGATTTATTGGGTTTAAAGGGAGCGTAGGACCGTGAGGTAAGCGTGTTGTGAAATGT
+AGGCGCCCAACGTACTGGCACTCGCAGCGTCGAACTGCCCCACTTGAGTGCGCGCAACGCCGGCGGAACTCGTCGTGTAG
+CGGTGAAATGCTTAGATATGACGAAGAACCCCGATTGCGAAGGCAGCTGGCGGGAGCGTAACTGACGCTGAAGCTCGAAA
+GCGCGGGTATCGAACAGGATTAGATACCCTGGTAGTCCGCGCGGTAAACGATGGATGCCGCTGTGCGCGCCTGGCGTGCC
+GCGGCTAAAGC
+>228762
+TACGTAGGGAGCAAGCGTTATCCGGATTTATTGGGTGTAAAGGGTGCGTAGACGGGACAACAAGTTAGTTGTGAAATCCC
+TCGGCTTAACTGAGGAACTGCAACTAAAACTATTGTTCTTGAGTGTTGGAGAGGAAAGTGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAGGAACACCGGTGGCGAAGGCGACTTTCTGGACAATAACTGACGTTGAGGCACGAAAGTGTG
+GGGAGCAAACAGGGATTAGATACCCTGGTAGTCCACACCGTAAACGATGGATACTAGGTGTAGGAAATGATTTCATTTTC
+TGTGCCGTCGCAAACGCAATAAGTATCCCACCTGGGGAGTACGGCCGCAAGGTTGAAACTCAAAGGAATTGACGGGGGCC
+CGCGACCAAGCCAGTGGAGTAGTGTGGGTTTAATTCGAAGTCAACGCGAAGAACCTTACC
+>198528
+TACGTATGGTGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGCAGGCGGTGCGGCAAGTCTGATGTGAAAGCCC
+GGGGCTCAACCCCGGTACTGCATTGGAAACTGTCGTACTAGAGTGTCGGAGGGGTAAGTGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCTTACTGGACGATAACTGACGCTAGAGGCTACGAAGCGG
+TGGGAGACAAACGAGGTATTAAGACTACCCGTGGTACGTCCA
+>167365
+TACGGAGGATGCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGTAGGCGGACCATCAAGTCAGCGGTCAAAAGTC
+GGGGCTCAACCCCGTGAAGCCGTTGAAACTGGCGGTCTTGAGTGAGCGAGAAGTAGGCGGAATGCGTGGTGTAGCGGTGA
+AATGCATAGATATCACGCAGAACTCCGATTGCGAAGGCAGCCTGCCGGCGCTCAACTGACGCTGAGGCACGAAAGTGCGG
+GGATCGAACAGGATTAGATACCCTGGTAGTCCGCACAGTAAACGATGAATGCTAGCTGTCCGGTCCGAATGAGGACTGGG
+GTGGCACAGCGAAAGCGTTAAGCATTCCACCTGGGGAGTACGCCGGCAACGGTGAAACTCAAAGGAATTGACGGGGCCCG
+CACAGCGGAGAACATGTGGTTTAATTCGATGATAGCGAGGAACCTTACCCGGGCTCAAACGCCGCAGGAATC
+>239053
+AACGTAGGGTGCAAGCGTTGTCCGGAATTACTGGGTGTAAGGGGAGCGCAGGCGGGAAGACAAGTTGGAAAGTGAAAACC
+ATGGGGCTACAACCCCGATGAAACTTGCTTTCAAAAACTGTTTTTACTTGAGTAGTGCAGAGGTAGGATGGGAATTCCCC
+GGTGTAGCGGTGGAATGCGTAGATATCGGGAGGAAC
+>526081
+TACGTATGGTGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGTAGGTGGCAAGGCAAGCCAGAAGTGAAAACCC
+GGGGCTCAACCGCGGGATTGCTTTTGGAACTGTCATGCTAGAGTGCAGGAGAGGTAAGCGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAGGAACATCAGTGGCGAAGGCGGCTTACTGGACTGAAACTGACACTGAGGCACGAAAGCGTG
+GGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCGT
+>129783
+TACGTAGGTGGCAAGCGTTGTCCGGAATTATTGGGCGTAAAGCGCGCGCAGGCGGCTTTCTAAGTCCATCTTAAAAGTGC
+GGGGCTTAACCCCGTGATGGGATGGAAACTGGGAAGCTGGAGTATCGGAGAGGAAAGTGGAATTCCTAGTGTAGCGGTGA
+AATGCGTAGAGATTAGGAAGAACACCGGTGGCGAAGGCGACTTTCTGGACGACAACTGACGCTGAGGCGCGAAAGCGTGG
+GGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGAATACTAGGTGTAGGAGGTATCGACCCCTTCTG
+TGTCGGAGCTAACACAATAAGTATTCCGCCTGGGAAGTACGATCGCAAGATTAAAACTCAAAGGAATTGACGGGGGCCCC
+GCACAAAGCGGTGGGAGTAGTGGTTAGGTTTAA
+>198740
+TACGTAGGGGGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGCAGGCGGTGCGGCAAGTCTGATGTGAAAGCCC
+GGGGCTCAACCCCGGGACTGCATTGGAAACTGTCGTACTTGAGTATCGGAGAGGTAAGTGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCTTACTGGACGATAACTGACGCTGAGGCTCGAAAGCGTG
+GGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGAATACTAGGTGTCGGGGGACATAGTCCTTCGG
+TGCCGCAGCAAACGACAATAAGTAATTCCACCTCGGGAGTAACGTTCGCAAGAAATGAAACTCAAAGGAATTGACGGGGA
+CCCGGCACAAGACGTGGAGCGATGTGTTTAATTCGAAGCAACGCGAAGAACCTTAACCAA
+>583500
+TACGTAGGGGGCGAGCGTTGTCCGGAATTATTGGGCGTAAAGGGCGCGTAGACGGCTGAGTAAGTTACTGGTGAAAGCCC
+AGCTTTTAAGGCTGGAATTGCCGGTAATACTGTTCAGCTTGAGTGCAGGAGAGGGAAGCGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCTTTCTGGACTGTAACTGACGTTGAGGCGCGAAAGCGTG
+GGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGGATGCTAGGTGTGGGAGGTATCGACCCTTCCG
+TGCCGCAGTTAACGCAATAAGCATCCCGCCTGGGGAGTACGGCCGCAAGGTTGAAACTCAAAGGAATTGACGGGGGCCCG
+CACAAGCAGCGGAGCATGTGGTTTAATTCGACGCAACGCGAAGAACCTTACCAGGACTTGACATCTGATTAAGCTTTGTG
+GAAACACAAGGTCCCTTCGGGGGAATCAAGACAGGTGGTGC
+>333775
+TACGTAGGTCCCGAGCGTTGTCCGGATTTATTGGGCGTAAAGCGAGCGCAGGCGGTTAGAAAAGTCTGAAGTGAAAGGCA
+GTGGCTCAACCATTGTAGGCTTTGGAAACTGTTTAACTTGAGTGCAGAAGGGGAGAGTGGAATTCCATGTGTAGCGGTGA
+AATGCGTAGATATATGGAGGAACACCGGTGGCGAAAGCGGCTCTCTGGTCTGTAACTGACGCTGAGGCTCGAAAGCGTGG
+GGAGCGAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGAGTGCTAGGTGTTAGGTCCTTTCCGGGACTTAG
+TGCCGCAGCTAACGCATTAAGCACTCCGCCTGGGGAGTACGACCGCAAGGTTGAAACTCAAAGGAATTGACGGGGCCGCG
+ACAAGCGGTGGAGCATGTGGTTTAATTCGAAGCAACGCGAAGAACCTTACCAGGTCTTGACATCCCAGT
+>193576
+TACGTAGGGTGGCAAGCGTTGTCCGGATTTACTGGGTGGTAAAGGGCGCGTAGGCGGGATGGCAAGTCAGATAGTGAAAT
+CCATGGGCTCAACCCATGAACTGCATTTGAAACTGTCGTTCTTGAGTATCGGAGAGGCAAGCGGAATTCCTAAGTGTAGC
+GGTGAAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCTTGCTGGACGACAAACTG
+>292073
+TACGGAAGGTCCGGGCGTTATCCGGATTTATTGGGTTTAAAGGGAGCGTAGGCCGGAGATTAAGCGTGTTGTGAATGTAG
+ATGCTCAACATCTGCACTGCAGCGCGAACTGGTTTCCTTGAGTACGCACAAAGTGGGCGGAATTCGTGGTGTAGCGGTGA
+AATGCTTAGATATCACGAAGAACTCCGATTGCGAAGGCAGCTCACTGGAGCGCAACTGAGCCCCCTGAAGCTCGAAAGTG
+CGGGTATCGAACAGGATTAGATACCCTGGTAGTCCGCACGGTAAACGATGGATGCCGCTGTTGGTCTGAATAGGTCAGCG
+GCCAAGCGAAAGCATTAAGCATCCCACCTGGGGAGTACGCCGGCAACGGTGAAACTCAAAGGAATTGACGGGGACCCGCG
+ACAAGCGGTGGAACATGTGGTTTAATTCGAAGCAACGCGAAAAACTTACCAGGGCTTGACATCTGACGAATCTGGATGAA
+AGTTCGGAGTGCTCTTCGGAGAGCG
+>234421
+TACGGAAGGTCCGGGCGTTATCCGGATTTATTGGGTTTAAAGGGAGCGTAGGCCGGAGATTAAGCGTGTTGTGAAATGTG
+GACGCTCAACGTCTGCACTGCAGCGCGAACTGGTTTCCTTGAGTACGCACAAAGTGGGCGGAATTCGTGGTGTAGCGGTG
+AAATGCTTAGATATCACGAAGAACTCCGATTGCGAAGGCAGCTCACTGGAGCGCAACTGACGCTGAAGCTCGAAAGTGCG
+GGTATCGAACAGGATTAGATACCCTGGTAGTCCGCACGGTAAACGATGGATCGCCCGGCTTGGTTGGTCTGAATAGGTCA
+GCGGCCAAGCGAAAGCATTAAGCACT
+>307569
+TACGGAGGATCCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGAGCGTAGGCGGACGCTTAAGTCAGTTGTGAAAGTTT
+GCGGCTCAACCGTAAAATTGCAGTTGATACTGGGTGTCTTGAGTACAGTAGAGGCAGGCGGAATTCGTGGTGTAGCGGTG
+AAATGCTTAGATATCACGAAGAACTCCTATTGCGAAGGCAGCTCACTAAACTGCAACTGACATTGAGGCTCGAAAGTGTG
+GGTATCAAACAGGATTAGATACCCTGGTAGTCCACACGGTAAACGATGAATACTCGCTGTTTGCGATATACAGCAAGCGG
+CCAAGCGAAAGCATTAAGTATTCCACCTGGGGAGTACGCCGGCAACGGTGAAACTCAAAGGAATTGACGGGGCCCGCACA
+AGCGGAGGAACATGTGGTTTAATCGATGATACGCGAGG
+>292556
+TACGTAGGGGGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGTAGACGGTCAAGCAAGTCAGAAGTGAAAGGCT
+GGGGCTCAACCCCGGGACTGCTTTTGAAACTGTTTGACTAGAGTGCTGGAGAGGTAAGCGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCTTACTGGACTGTAACTGACACTGAGGCTCGAAAGCGTG
+GGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGAATACTAGGTGTCGGGGCCCATAAGGGCTTCG
+GTGCCGCAGCAAACGCAAATAAGTATTCCACCTGGGGAGTACGTTCGCAAGAATGAAACTCAAAGGAATTGGACGGGGAC
+CCGCACAAGCGGTGGAGCATGTGGTTTAATTCGAAGCAACGCGAAAGAACCTTACCAAGTCTTGACATCCTTCTGACCGG
+ACAGTAATGTGTCC
+>345211
+AACGTAGGTCACAAGCGTTGTCCGGAATTACTGGGTGTAAAGGGAGCGCAGGCGGGAGAACAAGTTGGAAGTGAAATCCA
+TGGGCTCAACCCATGAACTGCTTTCAAAACTGTTTTTCTTGAGTAGTGCAGAGGTAAGCGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCTTACTGGACGATAACTGACGCTGAGGCTCGAAAGCGTG
+GGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAAACGATGAATACTAGGTGTTGGGAAGCATTGCTTCTCG
+GTGCCGTCGCAAACGCAGTAAGTATTCCACCTGGGGA
+>189485
+TACGGAGGATGCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGTAGGCGGCACGCCGAGTCAGCGGTGAAATTTC
+CGGGCTCAACCCGGAGTGTGCCGTTGAAACTGGCGAGCTAGAGTACACAAGAGGCGAGGCGGAATGCGTGGTGTAGCGGG
+TAGAAATGCATAGATATCA
+>514272
+TACGTATGGAGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGTGTAGGTGGCATCACAAGTCAGAAGTGAAAGCCC
+GGGGCTCAACCCCGGGACTGCTTTTGAAACTGTGGAGCTGGAGTGCAGGAGAGGTAAGCGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAGGAACATCAGTGGCGAAGGCGGCTTACTGGACTGAAACTGACACTGAGGCACGAAAGCGTG
+GGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGAATACTAGGTGTCGGGGCCGTAGAGGCTTCGG
+TGCCGCAGCCAACGCAGTAAGTATTCCACCGTGGGGAGTACGTTCGCAAGAATGAAACTCAAAGGATTGGACGGGACCCG
+CACAAGCGGTGGAGCATGTGGTTTAATTCGAAGCAACGCGAAGAACCTTA
+>277196
+TACGTAGGTGGCAAGCGTTGTCCGGAATTATTGGGCGTAAAGCGCGCGCAGGCGGCTTCCCAAGTCCCTCTTAAAAGTGC
+GGGGCTTAACCCCGTGATGGGAAGGAAACTGGGAAGCTGGAGTATCGGAGAGGAAAGTGGAATTCCTAGTGTAGCGGTGA
+AATGCGTAGAGATTAGGAAGAACACCGGTGGCGAAGGCGACTTTCTGGACGAAAACTGACGCTGAGGCGCGAAAGCGTGG
+GGAGCAAACAGGATTAGGATACCCTGGTAGTCCACGCCGTAAACGATGATTACTAGGTGTGGGGGACTGACCCTTCCGTG
+CCGCAGCAAACGCAATAAGTAATCCACCTGGGGAGTACGACCGCAAGGTTGAAACTCAAAGGAATTGACGGGGCCCGCAC
+AAGCAGTGGAGTATGTGGATTAAGTTCGAAGCAACGCGAAGAACCTTA
+>184403
+TACGGAGGATGCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGTAGGCGGAACTCCAAGTCAGCGGTAAAAATTC
+GGGGCTCAACCCCGTCGTGCCGTTGAAACTGGAGTCCTTGAGTGGGCGAGAAGTATGCGGAATGCGTGGTGTAGCGGTGA
+AATGCATAGATATCACGCAGAACCCCGATTGCGAAGGCAGCATACCGGCGCCCAACTGACGCTGAAGCACGAAAGCGTGG
+GTATCGAACAGGATTAGATACCCTGGTAGTCCACGCAGTAAACGATGGATACTAGCTGTCCGGGGGGATTGACCCCTGGG
+TGGCACAGCGAAAGCGTTAAGTATCCCACCTGGGGAGTACGCCGGCAACGGTGAAACTCAAAGGAATTGACGGGGGCCGC
+ACAAGCGGAGGAACATGTGGTTTAATTCGA
+>295754
+TACGTAGGTGGCGAGCGTTATCCGGAATCATTGGGCGTAAAGAGGGAGCAGGCGGCCGCAAGGGTCTGTGGTGAAAGACC
+GAAGCTAAACTTCGGTAAGCCATGGAAACCGGGCGGCTAGAGTGCGGAAGAGGATCGTGGAATTCCATGTGTAGCGGTGA
+AATGCGTAGATATATGGAGGAACACCAGTGGCGAAGGCGACGGTCTGGGCCGCAACTGACGCTCATTCCCGAAAGCGTGG
+GGAGCAAATAGGATTAGATACCCTAGTAGTCCACGCCGTAAACGATGGTCACTAAGTGTCGGGGGTCAAACCCGGTCGCT
+CGCAGTCAACGACAACTAAGTAGACCCGCCCTCGAGTAGTACGTTCGTCAA
+>291710
+TACGTAGGGAGCGAGCGTTGTCCGGATTTACTGGGTGTAAAGGGTGCGTAGGCGGCCGAGCAAGTCAGTTGTGAAAACTA
+TGGGCTTAACCCATAACGTGCAATTGAAACTGTCCGGCTTGAGTGAAGTAGAGGTAGGCGGAATTCCCGGTGTAGCGGTG
+AAATGCGTAGAGATCGGGAGGAACACCGGTGGCGAAGGCGACTTTCTGGACAGAAACTGACGTTGAGGCACGAAAGTGTG
+GGGAGCAAACAGGATTAGATACCCTGGTAGTCCACACCGTAAACGATGGATACTAGGTGTAGGGGATATTAAAATTCTCT
+GTGCCGCCGCTAACGCAATAAGTATCCCACCTGGGGAGTACGGCCGCAAGGTTGAAACTCAAAGGAATTGACGGGGGCCC
+GCACAAGCAGTGGAGTATGTGGTTTAATTCGACGCAACGCGAAGAACCTTACCGGGGCTTGACATATAAGTGAATAAATA
+AAGAGATTAGTTAGCTCTTCGG
+>298252
+AACGTAGGTCACAAGCGTTGTCCGGAATTACTGGGGTGTAAAGGGAGCGCAGGGCGGGAAGACAAGTTGGAAGTGAAATC
+CATGGGCCCAACCATGAACTGTCTTTCTAAAACTTAGTTTTTCTTGAGTAGTGCAGAGGTAGGCGGAATTCCCGGTGTAG
+CGGTGGAATGCGTAGATA
+>346253
+TACGTATGGTGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGCAGGCGGTGCGGCAAGTCTGATGTGAAAGCCC
+GGGGCTCAACCCCGGTACTGCATTGGAAACTGTCGTACTAGAGTGTCGGAGGGGTAAGTGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAGGAACATCTGTGGCGAAGGCGACTTACTGGACGATTACTGACGCTGAGACACGAAAGCGTG
+GGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGAACACTAGGTGTGGGGGGCGCAAGCCTCCGTG
+CCGCAGCTAACGCAATAAGTGTTCCACCTGGGAGTACGGCCGCAAGGTTGAAACTCAAAGGAATTGACGGGAGCCCGCAC
+AAGCAGTGGAGTATGTGGTTTAATTCGACGCAACGCGAAGAACCTTACCAGGACTTGACATCCCAAGAACGTCGGCGTAA
+TGGCTGATGTGCCCTTCGGGGGAGCGTTGGAGACA
+>212619
+TACGTAGGGTGCAAGCGTTATCCGGAATTATTGGGCGTAAAGGGCTCGTAGGCGGTTCGTCGCGTCCGGTGTGAAAGTCC
+ATCGCTTAACGGTGGATCCGCGGCCGGGGTACGGGCGGGCTTGAGTGGCGGGTAGGGGAGACTGGAACTTCCCGGTGTAA
+CGGTGGAATGTGTAGATAG
+>527741
+TACGTAGGGGGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGTAGACGGCACAGCAAGTCTGATGTGAAAGCCC
+GGGGCCCAACCCCGGAACTGCATTGGAAACTGCTGGGCTTGAGTGCAGGAGAGGTAAGCGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAGGAACACCAGTGGCGAGGGCGGCTTACTGGACTGTAACTGACGTTGAGGCTCGAAAGCGTG
+GGGAGCAAACAGGATTAGATACCCGTGGTACGTCCACGACCGTAAACGATGTATTACGTAGGTCGGGTACCGGGGAACCG
+>171551
+TACGTAGGTGGCAAGCGTTGTCCGGATTTACTGGGTGTAAAGGGCGTGTAGGCGGGAGAGCAAGTCAGAAGTGAAATCTA
+TGGGCTTAACCCATAAACTGCTTTTGAAACTGTTCTTCTTGAGTATCGGAGAGGCAGGCGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCCTGCTGGACGACAACTGACGCTGAGGCGCGAAAGGCGT
+GGGGAGCAAACAGGATTAGATACCCCGGTAGTCCACGACTGTAAACGATGAATACTAGGGTGTCGGGAGGACTGACCCCT
+TCGTGCCGCAGTTAACACAATAAGTATTCCACCTGGGGAGTACGATACGCAAAGATTA
+>579000
+TACGTATGGTGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGCAGGCGGTCTGGCAAGTCTGATGTGAAAATCC
+GGGGCTCAACTCCGGAACTGCATTGGAAACTGTCAGACTAGAGTGTCGGAGAGGTAAGTGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCTTACTGGACGATAACTGACGCTGAGGCTCGAAAGCGTG
+GGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGAATACTAGGTGTCGGGGGCACAAAAGTGCTTC
+GGTGCCGCAGCAAACGCATTAAGTATTCCACCTGGGGAGTACGTTCGCAAGAATGAAACTCAAAGGAATTGACGGGGACC
+CGCACAAGCGGTGGAGCATGTGGTTAATTCGAAGCAACGCGAAGAACCTTACCAAGTCTTGACATCCCTCTGACCCGACT
+CTTAACCGAGTTCTTTCC
+>563574
+AACGTAGGGTGCAAGCGTTGTCCGGAATTACTGGGTGTAAAGGGAGCGCAGGCGGGAAGACAAGTTGGAAGTGAAAACCA
+TGGGCTCAACCCATGAATTGCTTTCAAAACTGTTTTCTTGAGTAGTGCAGAGGTAGATGGAATTCCCGGTGTAGCGGTGG
+AATGCGTAGATATCGGGAGGAACACCAGTGGCGAAGGCGGTCTACGTGGCACCAACTGACGCTGAGGCT
+>165257
+TACGTAGGTGGCGAGTGTTATCCGGAATCATTGGGCGTAAAGAGGGAGCAGGCGGCCGCAAGGGTCTGTGGTGAAAGACC
+GAAGCTAAACTTCGGTAAGCCATGGAAACCGGGCGGCTAGAGTGCGGAAGAGGATCGTGGAATTCCATGTGTAGCGGTGA
+AATGCGTAGATATATGGAGGAACACCAGTGGCGAAGGCGACGGTCTGGGCCGCAACTGACGCTCATTCCCGGAAGCGTGG
+GGAGCAAATAGGATTAGATACCCTAGTAGTCCACGCCGTAAACGATGGTCACTAAGTGTCGGGGTCAAACCCCCGGTGCT
+GCAGTCAACGCAATAAGTGACCCGCCTGAGTAGTACGTTCGCAAGAATGAAACTCTAAGAAGTGTACGGGGCCCGGCCGA
+CAAGCGGTCGGAGCGATAGTGGTTT
+>291610
+TACGTATGGTGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGTGGACGGAGAGGCAAGTCTGATGTGAAAAACC
+CGGGGCTCAACCCCGGGACTGCATTGGAAACTTGTTTTTCTAGAGTGTCGGAGAGGTAAGTGGAATTCCTAGTGTAGCGG
+TGAAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCTTACTGGACGATGACTGACGTTGAGGCT
+>338272
+TACGTAGGGGGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGTAGGCGGTCTGACAAGTCAGAAAGTGAAAGCC
+CGGGCTCAACTCCGGGACTGCTTTTGAAACTGCCGGACTAGATTGCAGGAGAGGTAAGTGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCTTAC
+>237003
+TACGGAAGGTCCGGGCGTTATCCGGATTTATTGGGTTTAAAGGGAGCGTAGGCCGCAGGTTAAGCGTGTTGTGAAATGTA
+GGGGCTCAACCTCTGTACTGCAGCGCGAACTGGCTTGCTTGAGTACGCACAACGGTGGGCGGAATTCGTGGTGTAGCGGT
+GAAATGCTTAGATATCAACGAAGAACTCCGATTGCGAAGGCAGCTCGACGGGAGCG
+>207832
+TGCGTAGGGGGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGTAGACGGAATGGCAAGTCTGATGTGAAAGGCC
+GGGGCTCAACCCCGGGACTGCATTGGAAACTGCCAATCTAGAGTACCGGAGGGGTAAGTGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCTTACTGGACGGTAACTGACGTTGAGGCTCGAAAGCGTG
+GGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATAGAATACTAGGTGTTGGGTAGCAAAGCTATTCG
+GTGCCGCAGCAAACGCAATAAGTATTCCACCTGGGGAGTACGTTCGCAAGAATGAAACTCAAAGGAATTGACGGGGACCC
+GCACAAGCGGTGGAGCATTGTGGTTTAATTCGAAGCAACGCGAAGAACCTTACCAAATCTTGACATCGATCCCGACCGGA
+CCGTAATCGGGTTCCTTTTCCCGTTCGGGG
+>31334
+TACGTATGGTGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGTAGACGGAGTGGCAAGTCTGATGTGAAAACCC
+GGGGCTCAACCCCGGGACTGCATTGGAAACTGTCAATCTAGTAGTACCGGAGAGGTAAAGCGGAATTCCTAGTGTAGCGG
+TGAAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCTTACTGGACGGTAACTGACGTTGAGGCTCGAAAGCG
+TGGGGAGCAAACAGGATTAGATAACCCTGGTAGTCCACGCCGTAAACGATGAATACTAGGTGTGGGAGGACTGACCCCTT
+CCGTGCCGCAGTTAACACAATAAGTATTCCACCTGGGGAGTACGACCGCAAGGTTGAAACTCAAAGGATTGACGGGGCCC
+GCACAAGCAGTGGATTATGGT
+>70528
+AACGTAGGGTGCAAGCGTTGTCCGGAATTACTGGGTGTAAAGGGAGCGCAGGCGGGAAGACAAGTTGGAAGTGAAAACCA
+TGGGCTCAACCCATGAATTGCTTTCAAAACTGTTTTTCTTGAGTAGTGCAGAGGTGATGGAATTCCCGGTGTAGCGGTGG
+ATGCGTAGATATCGGGAGGAACACCAGTGGCGAAGGCGGTCTACTGGGCACC
+>566717
+CAGATAGTTGCGGGCGTAGCCGTCGGAGACCTCGTGGATCTCGTCCTTCTTGCCAATGCCCTTGATATCCTGCTTGAGAA
+TCACTTTCATGATGTTATCCCTCACTTCTCACCGGAAAACCGGTGCAGTTGTTGTGTTTATAGTATAGCGCATTTGTCGC
+TTTTGGGCAAGAGGAAGCACGCATTGCAGCATTTTCCTCAGGTGAACTCCCCCAGTCTGCTTCGCAGACAGCCCCCTCCT
+GAGGTGGGGCCGTTTGGCAAAACCGAAAGGCTTGTCCTTTTCGTCCAGAGGGCTCCTTCCTTTG
+>157866
+TACGGAGGATGCGAGCGTTATCCGGTATTTTATTAGGGTTTAAAGGGTGCGTAGGCGGACTGTCAAGTCAGCGGTAAAAT
+ACGGGGGCTCAACCTCCGCCCGCCGTTGAAACTGACGGTCTTGAGTGGGCGAGAAGTATGCGGAATGCGTGGTGTAGCGG
+TGAAATGCATAGATATCACGCAGAACTCCGATTGCGAAGGCAGCATACCGGCGCCC
+>154106
+TACGTAGGGGGGCGAGCGTTGTCCGGAATCACTGGGCGTAAAGGGCGCGTAGGCGGTTTAATAAGTCAGTGGTGAAAACT
+GAGGGCTCAACCCTCAGCCTGCCACTGATACTGTTAGACTTGAGTATGGAAGAGGAGAATGGAATTCCTAGTGTAGCGGT
+GAAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGATTCTCTGGGCCAAGACTGACGCTGAGGCGCGAAAGCGT
+GGGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCGTAACGATGGATACTAGGTGTTAGTAGTTTCGATGCTACTA
+GTGCCGGAGTAAACACAATAAGTATCCCGCCTGGGGAGTACGGTCGCAAGACTGAAACTCAAAGGAATTGACGGGGGCCC
+GCGACAAGCGGTGGAGCATGTGGTTTAATTCG
+>561594
+TACGTAGGGGGCGAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGTAGACGGCGTATCAAGTCTGATGTGAAAGGCA
+GGGGCTTAACCCCTGGACTGCATTGGAAACTGGTATGCTTGAGTGCCGGAGGGGTAAGCGGAATTCCTAGTGTAGCGATG
+AAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCTTACTGGACGGTAACTGACGTTGAGGCTCGAAAGCGTG
+GGGAGCAAACAGGATTAGATACCCTGGTAGTCCACACTGTAAACGATGAATGCTAGGTGTAGGGGTATCGACCCCTTCTG
+TGCCGCAGTCAACACAATAAGCATTCCGCCTGGGGAGTACGGCCGCAAGGTTGAACTCAAAGGAATTGACGGGGCCCGCA
+CAAGCAGCGGAGCATGTGGTTTAATTCGACGCAACGCGAAGACCTTACCAGGTCTTGACATCCACTTAAACTTAC
+>137557
+TACGGAAGGTCCGGGCGTTATCCGGATTTATTGGGTTTAAAGGGAGCGCAGGCCGTCCTTTAAGCGTGCTTTGTGAAATG
+CCGCGGCTCAACCGTGGCACTGCAGCGCGAACTGGAGGACTTGAGTACGCACGAGGTAGGCGGAATTCGTGGTGTAGCGG
+TGAAATGCTTAGATATCACGAAGAACTCCGATTGCGAAGGCAGCCTGCTGGACTGTAACTGACGCTGATGCTCGAAAGTG
+TGGGTATCAAACAGGATTAGATACCCTGGTAGTCCACACAGTAAACGATGAATACTCGCTGTTTGCGATATACAGTAAGC
+GGCCAAGCGAAAGCGTTAAGTATTCCACCTGGGGAGTACGCCGGCAACGGTGAAACTCAAAGGAATTGACGGGGCCCGCA
+CAAGCGGAGGACATGTGGTTAATTCGATGATACGCGAGGAACCTTACCCGGGCTTGAATTGCAACTGAATGATGTGGAGA
+CATGTCAGCCGCAAGGCAG
+>115230
+GCTGTCGAAAGGCCGGGCGTTCCCGGTCAATGCCGGAGTAATCTTCATCACAATAAATATTGTAGATGTCCCAGCCCTTG
+TCCAGCGCGTACTTGATGAGCATGGATTTCTGGTTTTGGATGCTCTCGGACTCCGATTGCTTCTCCTCATCCTCCCGGCT
+CAGCCTGCAATAAATCGCCGCCGTCATGGCACTACCTCCTGCACAGTGTTCTGCCGCAACGGCAATACTGCCCAGGCACA
+CAGG
+>470527
+TACGGAAGGTTCGGGCGTTATCCGGATTTATTGGGTTTAAAGGGAGCGTAGGCTGGAGATTAAGCGTGTTGTGAAATGTA
+GATGCTCAACATCTGCACTGCAGCGCGAACTGGTTTCCTTGAGTACGCACAAAGTGGGCGGAATTCGTGGTGTAGCGGTG
+AAATGCTTAGATATCACGAAGAACTCCGATTGCGAAGGCAGCTCACTGGAGCGCAACTGACACTGAAGCTACGAAGTGCG
+GGTATCGAACAGGATTAGACTACCCGTGGTACGTCCG
+>471308
+TACGTAGGTGGCAAGCGTTGTCCGGATTTACTGGGTGTAAAGGGCGCGTAGGCGGGATGGCAAGTCAGATGTGAAATCCA
+TGGGCTCAACCCATGAACTGCATTTGAAACTGTCGTTCTTGAGTATCGGAGAGGCAAGCGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCTTGCTGGACGACAACTGACGCTGAGGCGCGAAAGCGTG
+GGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGGATGAATACTAGGTGTGGGGGGACTGACCCCTCCG
+TGCCGCAGTTAACACAATAAGTATT
+>470599
+TACGTAGGTGGCGAGCGTTATCCGGATTTATTGGGCGTAAAGGGTGCGTAGGCGGTATGTTAAGTAATAAAAATAAAAGC
+CCGAAGCTTTAACTTTCGGTTTCGTTTTAATAAACTGGCAAAACTAGAGTACAGTAGAGGCAAATGGAATTCCTAGTGTA
+GTAGGTAAAATGCGTAGATATTAGGAGGAACACCGGTGGCGAAGGCGATTTGCTGGGCTGTAACTGACGCTGAGGCAACG
+AAAGCGGTGGGGA
+>134207
+TACGGAAGGTCCGGGCGTTATCCGGATTTATTGGGTTTAAAGGGCGTGTAGCCGGGAGGGCAAGTCAGATGTGAAATCCA
+CGGGCTCAACTCGTGAACTGCATTTGAAACTACTCTTCTTGAGTATCGGAGAGGCAATCGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGATTGCTGGACGACAACTGACGGTGAGGCGCGAAAGCGTG
+GGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCTGTAAACGATGAATACTAGGTGTGCGGGGACTGACCCCCTGCG
+TGCCGCAGCTAACGCAATAGTATTCCACCTGGGGAGTACGATCGCAAGGTTGAAACTCAAAGGAATTGACGGGGGCCCGC
+GACAAGCGGTGGATTATGTGGTTTAATTCGAAGCAACGCGAAGAACCTTACCGAGGGCTTGACATCCTACTAACGAAGTA
+GAGATACATCAGGTCGCCCGTTCGGGGAAAGTAGAGAC
+>258375
+TACGTAGGTGGCAAGCGTTGTCCGGATTTACTGGGTGTAAAGGGCGTGTAGCCGGAGAGACAAGTCAGATGTGAAATCCG
+CGGGCTCAACTCGCGAACTGCATTTGAAACTGTTTCCCTTGAGTATCGGAGAGGTAACCGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAAGAACACCAGTGGCGAAGGCGGGTTACTGGACGACAACTGACGGTGAGGCGCGAAAGCGTG
+GGGAGACAAACAGGATTAGATACCCTGGTAGTCCACGCTAGTAAACGATACAATACTAGGTGTGGCGGGGACTCGACCCC
+CTGCGTCGCC
+>585355
+TACGTAGGTGGCAAGCGTTGTCCGGATTTACTGGGTGTAAAGGGCGTGTAGGCGGGATTACAAGTCAGATGTGAAATACC
+GGGGCTTAACTCCGGGGCTGCATTTGAAACTGTAGTTCTTGAGTGCCGGAGAGGAAAGCGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCTTTCTGGACGGTAACTGACGCTGAGGCGCGAAAGCGTG
+GGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGGATACTAGGTGTAGGAGGTATCGACCCCTTCT
+GTGCCGGAGTAAACACAATAAGTATCCCACCTGGGGAGTACGACCGCAAGGTTGAAACTCAAAGGAATTGACGGGGGCCC
+GCACAAGCAGTGGAGTATGTGGTTTAATTCGAAGCAACGCGAAGAACCTTACCAGGACTTGACATCCCGCGCATAGTATA
+GAGATATATGAAATCCTTCGGGACGC
+>26036
+TACGTAGGGAGCGAGCGTTGTCCGGAATTACTGGGTGTAAAGGGAGCGTAGGCGGGAAAGCAAGTTGGAAGTGAAATGCA
+TGGGCTCAACCCATGAGCTGCTTTCAAAACTGTTTTTCTTGAGTGAAGTAGAGGCAGGCGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCCTGCTGGGCTTTAACTGACGCTGAGGCTCGAAAGCGTG
+GGTAGCAAACAGGATTAGATACCCTGGTAGTCCACGCTGTAAACGATGATTACTAGGTGTGGGGGATTTCGGTCCTCCGT
+GCCGGAGCAAACGCAATAAGTAATCCACCTGGGGAGTACGGCCGCAAGGCTGAAACTCAAAGGAATTGACGGGGGCCCGC
+ACAAGCAGTGGATTATGTGGTTTAATTCGAAGACAACGCGAAGAACCTTACCAAGTCTTGACATCCTTCTGACCGGTACT
+TAACCGTACCTTCTCTCGGA
+>258502
+TACGTAGGGAGCGAGCGTTGTCCGGAATTACTGGGCGTAAAGGGCGCGTAGGCGGGCCTGTAAGTTGTGTGTGAAATACC
+CGGGCTTAACCTGGGGGGTGCATACAAAACTGTGGGTCTTGAGTGCGGTAGAGGAAAGTGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGACTTTCTGGGCCGTAACTGACGCTGAGGCGCGAAAGCGTG
+GGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGAATGCTAGGTGTAGGGGGTATCGACCCCTTCT
+GTGCCGCAGCAAACGCAATAAGCATTCCGCCT
+>13662
+TACGTAGGGGGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGTAGACGGAATGGCAAGTCTGATGTGAAAGGCC
+GGGGCTCAACCCCGGGACTGCATTGGAAACTGCCAATCTAGAGTACCGGAGGGGTAAGTGGAATTCCTAGTGTAGCGGTG
+AAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCTTACTGGACGGTAACTGACGTTGAGGCTCGAAAGCGTG
+GGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCTGTAAACGATGAATACTAGGTGTTGGGTAGCAAAGCTATTCGG
+TGCCGCAGCAAACGCAATAAGTATTCCACCTGGGGAGTACGTTCGCAAGAATGAAACTCAAAAGGAATTGGACGGGGACC
+CGCACAAGCGGTGGAGCATGTGGTTTAATTCGAAGCAACGCGAAGACCTTACAAATCTTGACATCGATCCGACCGGACCG
+TAA
diff --git a/inst/extdata/rformat_dist_0.03.txt.gz b/inst/extdata/rformat_dist_0.03.txt.gz
new file mode 100644
index 0000000..7068f22
Binary files /dev/null and b/inst/extdata/rformat_dist_0.03.txt.gz differ
diff --git a/inst/extdata/rich_dense_otu_table.biom b/inst/extdata/rich_dense_otu_table.biom
new file mode 100644
index 0000000..2cac771
--- /dev/null
+++ b/inst/extdata/rich_dense_otu_table.biom
@@ -0,0 +1,56 @@
+{
+ "id":null,
+ "format": "Biological Observation Matrix 1.0.0-dev",
+ "format_url": "http://biom-format.org",
+ "type": "OTU table",
+ "generated_by": "QIIME revision XYZ",
+ "date": "2011-12-19T19:00:00",
+ "rows":[
+ {"id":"GG_OTU_1", "metadata":{"taxonomy":["k__Bacteria", "p__Proteobacteria", "c__Gammaproteobacteria", "o__Enterobacteriales", "f__Enterobacteriaceae", "g__Escherichia", "s__"]}},
+ {"id":"GG_OTU_2", "metadata":{"taxonomy":["k__Bacteria", "p__Cyanobacteria", "c__Nostocophycideae", "o__Nostocales", "f__Nostocaceae", "g__Dolichospermum", "s__"]}},
+ {"id":"GG_OTU_3", "metadata":{"taxonomy":["k__Archaea", "p__Euryarchaeota", "c__Methanomicrobia", "o__Methanosarcinales", "f__Methanosarcinaceae", "g__Methanosarcina", "s__"]}},
+ {"id":"GG_OTU_4", "metadata":{"taxonomy":["k__Bacteria", "p__Firmicutes", "c__Clostridia", "o__Halanaerobiales", "f__Halanaerobiaceae", "g__Halanaerobium", "s__Halanaerobiumsaccharolyticum"]}},
+ {"id":"GG_OTU_5", "metadata":{"taxonomy":["k__Bacteria", "p__Proteobacteria", "c__Gammaproteobacteria", "o__Enterobacteriales", "f__Enterobacteriaceae", "g__Escherichia", "s__"]}}
+ ],
+ "columns":[
+ {"id":"Sample1", "metadata":{
+ "BarcodeSequence":"CGCTTATCGAGA",
+ "LinkerPrimerSequence":"CATGCTGCCTCCCGTAGGAGT",
+ "BODY_SITE":"gut",
+ "Description":"human gut"}},
+ {"id":"Sample2", "metadata":{
+ "BarcodeSequence":"CATACCAGTAGC",
+ "LinkerPrimerSequence":"CATGCTGCCTCCCGTAGGAGT",
+ "BODY_SITE":"gut",
+ "Description":"human gut"}},
+ {"id":"Sample3", "metadata":{
+ "BarcodeSequence":"CTCTCTACCTGT",
+ "LinkerPrimerSequence":"CATGCTGCCTCCCGTAGGAGT",
+ "BODY_SITE":"gut",
+ "Description":"human gut"}},
+ {"id":"Sample4", "metadata":{
+ "BarcodeSequence":"CTCTCGGCCTGT",
+ "LinkerPrimerSequence":"CATGCTGCCTCCCGTAGGAGT",
+ "BODY_SITE":"skin",
+ "Description":"human skin"}},
+ {"id":"Sample5", "metadata":{
+ "BarcodeSequence":"CTCTCTACCAAT",
+ "LinkerPrimerSequence":"CATGCTGCCTCCCGTAGGAGT",
+ "BODY_SITE":"skin",
+ "Description":"human skin"}},
+ {"id":"Sample6", "metadata":{
+ "BarcodeSequence":"CTAACTACCAAT",
+ "LinkerPrimerSequence":"CATGCTGCCTCCCGTAGGAGT",
+ "BODY_SITE":"skin",
+ "Description":"human skin"}}
+ ],
+ "matrix_type": "dense",
+ "matrix_element_type": "int",
+ "shape": [5,6],
+ "data": [[0,0,1,0,0,0],
+ [5,1,0,2,3,1],
+ [0,0,1,4,2,0],
+ [2,1,1,0,0,1],
+ [0,1,1,0,0,0]]
+ }
+
diff --git a/inst/extdata/rich_sparse_otu_table.biom b/inst/extdata/rich_sparse_otu_table.biom
new file mode 100644
index 0000000..17d9e26
--- /dev/null
+++ b/inst/extdata/rich_sparse_otu_table.biom
@@ -0,0 +1,66 @@
+{
+ "id":null,
+ "format": "Biological Observation Matrix 1.0.0-dev",
+ "format_url": "http://biom-format.org",
+ "type": "OTU table",
+ "generated_by": "QIIME revision XYZ",
+ "date": "2011-12-19T19:00:00",
+ "rows":[
+ {"id":"GG_OTU_1", "metadata":{"taxonomy":["k__Bacteria", "p__Proteobacteria", "c__Gammaproteobacteria", "o__Enterobacteriales", "f__Enterobacteriaceae", "g__Escherichia", "s__"]}},
+ {"id":"GG_OTU_2", "metadata":{"taxonomy":["k__Bacteria", "p__Cyanobacteria", "c__Nostocophycideae", "o__Nostocales", "f__Nostocaceae", "g__Dolichospermum", "s__"]}},
+ {"id":"GG_OTU_3", "metadata":{"taxonomy":["k__Archaea", "p__Euryarchaeota", "c__Methanomicrobia", "o__Methanosarcinales", "f__Methanosarcinaceae", "g__Methanosarcina", "s__"]}},
+ {"id":"GG_OTU_4", "metadata":{"taxonomy":["k__Bacteria", "p__Firmicutes", "c__Clostridia", "o__Halanaerobiales", "f__Halanaerobiaceae", "g__Halanaerobium", "s__Halanaerobiumsaccharolyticum"]}},
+ {"id":"GG_OTU_5", "metadata":{"taxonomy":["k__Bacteria", "p__Proteobacteria", "c__Gammaproteobacteria", "o__Enterobacteriales", "f__Enterobacteriaceae", "g__Escherichia", "s__"]}}
+ ],
+ "columns":[
+ {"id":"Sample1", "metadata":{
+ "BarcodeSequence":"CGCTTATCGAGA",
+ "LinkerPrimerSequence":"CATGCTGCCTCCCGTAGGAGT",
+ "BODY_SITE":"gut",
+ "Description":"human gut"}},
+ {"id":"Sample2", "metadata":{
+ "BarcodeSequence":"CATACCAGTAGC",
+ "LinkerPrimerSequence":"CATGCTGCCTCCCGTAGGAGT",
+ "BODY_SITE":"gut",
+ "Description":"human gut"}},
+ {"id":"Sample3", "metadata":{
+ "BarcodeSequence":"CTCTCTACCTGT",
+ "LinkerPrimerSequence":"CATGCTGCCTCCCGTAGGAGT",
+ "BODY_SITE":"gut",
+ "Description":"human gut"}},
+ {"id":"Sample4", "metadata":{
+ "BarcodeSequence":"CTCTCGGCCTGT",
+ "LinkerPrimerSequence":"CATGCTGCCTCCCGTAGGAGT",
+ "BODY_SITE":"skin",
+ "Description":"human skin"}},
+ {"id":"Sample5", "metadata":{
+ "BarcodeSequence":"CTCTCTACCAAT",
+ "LinkerPrimerSequence":"CATGCTGCCTCCCGTAGGAGT",
+ "BODY_SITE":"skin",
+ "Description":"human skin"}},
+ {"id":"Sample6", "metadata":{
+ "BarcodeSequence":"CTAACTACCAAT",
+ "LinkerPrimerSequence":"CATGCTGCCTCCCGTAGGAGT",
+ "BODY_SITE":"skin",
+ "Description":"human skin"}}
+ ],
+ "matrix_type": "sparse",
+ "matrix_element_type": "int",
+ "shape": [5, 6],
+ "data":[[0,2,1],
+ [1,0,5],
+ [1,1,1],
+ [1,3,2],
+ [1,4,3],
+ [1,5,1],
+ [2,2,1],
+ [2,3,4],
+ [2,5,2],
+ [3,0,2],
+ [3,1,1],
+ [3,2,1],
+ [3,5,1],
+ [4,1,1],
+ [4,2,1]
+ ]
+ }
diff --git a/inst/extdata/study_1457_split_library_seqs_and_mapping.zip b/inst/extdata/study_1457_split_library_seqs_and_mapping.zip
new file mode 100644
index 0000000..8d9b2f6
Binary files /dev/null and b/inst/extdata/study_1457_split_library_seqs_and_mapping.zip differ
diff --git a/inst/extdata/study_816_split_library_seqs_and_mapping.tar.gz b/inst/extdata/study_816_split_library_seqs_and_mapping.tar.gz
new file mode 100644
index 0000000..4ce0aa7
Binary files /dev/null and b/inst/extdata/study_816_split_library_seqs_and_mapping.tar.gz differ
diff --git a/inst/extdata/study_816_split_library_seqs_and_mapping.zip b/inst/extdata/study_816_split_library_seqs_and_mapping.zip
new file mode 100644
index 0000000..f103350
Binary files /dev/null and b/inst/extdata/study_816_split_library_seqs_and_mapping.zip differ
diff --git a/inst/extdata/study_gp.txt b/inst/extdata/study_gp.txt
new file mode 100644
index 0000000..87fd619
--- /dev/null
+++ b/inst/extdata/study_gp.txt
@@ -0,0 +1,29 @@
+#SampleID Primer Final_Barcode Barcode_truncated_plus_T Barcode_full_length SampleType Description
+CL3 ILBC_01 AACGCA TGCGTT CTAGCGTGCGT Soil "Calhoun South Carolina Pine soil, pH 4.9"
+CC1 ILBC_02 AACTCG CGAGTT CATCGACGAGT Soil "Cedar Creek Minnesota, grassland, pH 6.1"
+SV1 ILBC_03 AACTGT ACAGTT GTACGCACAGT Soil "Sevilleta new Mexico, desert scrub, pH 8.3"
+M31Fcsw ILBC_04 AAGAGA TCTCTT TCGACATCTCT Feces "M3, Day 1, fecal swab, whole body study"
+M11Fcsw ILBC_05 AAGCTG CAGCTT CGACTGCAGCT Feces "M1, Day 1, fecal swab, whole body study "
+F11Fcsw ILBC_06 AATCAG CTGATT ACGAGACTGAT Feces "F1, Day 1, fecal swab, whole body study "
+M31Plmr ILBC_07 AATCGT ACGATT CGAGTCACGAT Skin "M3, Day 1, right palm, whole body study"
+M11Plmr ILBC_08 ACACAC GTGTGT GCCATAGTGTG Skin "M1, Day 1, right palm, whole body study "
+F21Plmr ILBC_09 ACACAT ATGTGT GTAGACATGTG Skin "F1, Day 1, right palm, whole body study "
+M31Tong ILBC_10 ACACGA TCGTGT TGTGGCTCGTG Tongue "M3, Day 1, tongue, whole body study "
+M11Tong ILBC_11 ACACGG CCGTGT TAGACACCGTG Tongue "M1, Day 1, tongue, whole body study "
+F11Tong ILBC_12 ACACTA TAGTGT CGGATCTAGTG Tongue "F1, Day 1, tongue, whole body study "
+LMEpi24M ILBC_13 ACACTG CAGTGT CATGAACAGTG Freshwater "Lake Mendota Minnesota, 24 meter epilimnion "
+SLEpi20M ILBC_15 ACAGAG CTCTGT AGCCGACTCTG Freshwater "Sparkling Lake Wisconsin, 20 meter eplimnion"
+AQC1cm ILBC_16 ACAGCA TGCTGT GACCACTGCTG Freshwater (creek) "Allequash Creek, 0-1cm depth"
+AQC4cm ILBC_17 ACAGCT AGCTGT CAAGCTAGCTG Freshwater (creek) "Allequash Creek, 3-4 cm depth"
+AQC7cm ILBC_18 ACAGTG CACTGT ATGAAGCACTG Freshwater (creek) "Allequash Creek, 6-7 cm depth"
+NP2 ILBC_19 ACAGTT AACTGT TCGCGCAACTG Ocean "Newport Pier, CA surface water, Time 1"
+NP3 ILBC_20 ACATCA TGATGT GCTAAGTGATG Ocean "Newport Pier, CA surface water, Time 2"
+NP5 ILBC_21 ACATGA TCATGT GAACGATCATG Ocean "Newport Pier, CA surface water, Time 3"
+TRRsed1 ILBC_22 ACATGT ACATGT CACGTGACATG Sediment (estuary) "Tijuana River Reserve, depth 1"
+TRRsed2 ILBC_23 ACATTC GAATGT TGCGCTGAATG Sediment (estuary) "Tijuana River Reserve, depth 2"
+TRRsed3 ILBC_24 ACCACA TGTGGT GATGTATGTGG Sediment (estuary) "Tijuana River Reserve, depth 2"
+TS28 ILBC_25 ACCAGA TCTGGT GCATCGTCTGG Feces Twin #1
+TS29 ILBC_26 ACCAGC GCTGGT CTAGTCGCTGG Feces Twin #2
+Even1 ILBC_27 ACCGCA TGCGGT TGACTCTGCGG Mock Even1
+Even2 ILBC_28 ACCTCG CGAGGT TCTGATCGAGG Mock Even2
+Even3 ILBC_29 ACCTGT ACAGGT AGAGAGACAGG Mock Even3
\ No newline at end of file
diff --git a/inst/extdata/usearch.uc b/inst/extdata/usearch.uc
new file mode 100644
index 0000000..10b41ed
--- /dev/null
+++ b/inst/extdata/usearch.uc
@@ -0,0 +1,100 @@
+H 320 211 99.5 - 0 0 257I211M971I D21.a.393001_3 FYIZYI001CNZEI orig_bc=ACATACGCGT new_bc=ACATACGCGT bc_diffs=0 4416570
+H 416 211 97.6 - 0 0 320I211M1022I D21.5n.392974_5 FYIZYI001CTYEF orig_bc=TCGTCGCTCG new_bc=TCGTCGCTCG bc_diffs=0 4480244
+H 424 211 99.1 - 0 0 247I81MI130M962I D11.b.392956_4 FYIZYI001EQ4DZ orig_bc=ATATCGCGAG new_bc=ATATCGCGAG bc_diffs=0 4483037
+H 354 211 98.6 - 0 0 295I211M889I D18.6m.393070_9 FYIZYI001CKKSP orig_bc=TGTACTACTC new_bc=TGTACTACTC bc_diffs=0 4449518
+H 420 211 97.2 - 0 0 286I172MD38M997I D20.393127_12 FYIZYI001CGO3M orig_bc=TAGAGACGAG new_bc=TAGAGACGAG bc_diffs=0 4481359
+H 428 211 98.6 - 0 0 254I211M944I D27.393075_13 FYIZYI001CQWMO orig_bc=CAGTAGACGT new_bc=CAGTAGACGT bc_diffs=0 4484111
+H 236 211 100.0 - 0 0 251I211M986I D13.5n.393036_15 FYIZYI001BLMDT orig_bc=CTCGCGTGTC new_bc=CTCGCGTGTC bc_diffs=0 4226619
+H 419 211 98.6 - 0 0 253I211M959I D2.393107_21 FYIZYI001CNI99 orig_bc=TACTCTCGTG new_bc=TACTCTCGTG bc_diffs=0 4481131
+H 297 211 100.0 - 0 0 283I211M933I D19.a.393146_19 FYIZYI001C0CZP orig_bc=CGTAGACTAG new_bc=CGTAGACTAG bc_diffs=0 4381553
+H 402 211 97.2 - 0 0 262I177MI34M965I D31.393093_22 FYIZYI001BNJ2B orig_bc=TACGCTGTCT new_bc=TACGCTGTCT bc_diffs=0 4475642
+H 296 211 99.1 - 0 0 240I211M970I D13.a.393151_27 FYIZYI001EYIF0 orig_bc=TAGTATCAGC new_bc=TAGTATCAGC bc_diffs=0 4381430
+H 354 211 98.1 - 0 0 294I9MI202M889I D10.b.393074_24 FYIZYI001BY1Z0 orig_bc=AGCACTGTAG new_bc=AGCACTGTAG bc_diffs=0 4449518
+N * * * . * * * D15.b.393025_8 FYIZYI001BLWUY orig_bc=CGAGAGATAC new_bc=CGAGAGATAC bc_diffs=0 *
+N * * * . * * * D19.b.393086_17 FYIZYI001BNM56 orig_bc=TACGAGTATG new_bc=TACGAGTATG bc_diffs=0 *
+H 417 211 99.1 - 0 0 262I160MI51M969I D19.a.393146_25 FYIZYI001EKPWO orig_bc=CGTAGACTAG new_bc=CGTAGACTAG bc_diffs=0 4480359
+N * * * . * * * D11.b.392956_23 FYIZYI001EQLMG orig_bc=ATATCGCGAG new_bc=ATATCGCGAG bc_diffs=0 *
+H 421 211 98.6 - 0 0 262I211M957I D10.5n.393082_28 FYIZYI001C0C4Q orig_bc=ACGCTCGACA new_bc=ACGCTCGACA bc_diffs=0 4481719
+H 377 211 99.5 - 0 0 257I211M977I D2.393107_30 FYIZYI001D191N orig_bc=TACTCTCGTG new_bc=TACTCTCGTG bc_diffs=0 4463709
+H 428 211 97.7 - 0 0 253I102MI30MI41MD37M944I D15.5n.393148_31 FYIZYI001CPTB3 orig_bc=TACTGAGCTA new_bc=TACTGAGCTA bc_diffs=0 4484111
+H 424 211 100.0 - 0 0 248I211M962I D16.b.393030_32 FYIZYI001ELLNF orig_bc=TCACGTACTA new_bc=TCACGTACTA bc_diffs=0 4483037
+H 388 211 99.5 - 0 0 281I211M944I D17.392970_33 FYIZYI001E1URB orig_bc=CGTCTAGTAC new_bc=CGTCTAGTAC bc_diffs=0 4468234
+H 419 211 98.6 - 0 0 253I211M959I D1.393095_34 FYIZYI001BP0UJ orig_bc=ACGAGTGCGT new_bc=ACGAGTGCGT bc_diffs=0 4481131
+H 419 211 97.6 - 0 0 255I68MD8MD133M959I D13.a.393151_35 FYIZYI001EP8YM orig_bc=TAGTATCAGC new_bc=TAGTATCAGC bc_diffs=0 4481131
+H 297 211 99.5 - 0 0 284I8MD202M933I D26.392975_47 FYIZYI001EYIA0 orig_bc=CACGCTACGT new_bc=CACGCTACGT bc_diffs=0 4381553
+H 419 211 98.6 - 0 0 255I77MD17MD115M959I D2.393107_40 FYIZYI001C79G7 orig_bc=TACTCTCGTG new_bc=TACTCTCGTG bc_diffs=0 4481131
+H 421 211 99.1 - 0 0 262I211M957I D23.392960_44 FYIZYI001C0F4N orig_bc=AGACTATACT new_bc=AGACTATACT bc_diffs=0 4481719
+N * * * . * * * D18.6m.393070_6 FYIZYI001BSDZT orig_bc=TGTACTACTC new_bc=TGTACTACTC bc_diffs=0 *
+H 388 211 100.0 - 0 0 281I211M944I D11.a.392963_48 FYIZYI001EVLXY orig_bc=ATCAGACACG new_bc=ATCAGACACG bc_diffs=0 4468234
+H 428 211 98.1 - 0 0 255I173MD37M944I D16.a.393131_38 FYIZYI001EF2LA orig_bc=ATACGACGTA new_bc=ATACGACGTA bc_diffs=0 4484111
+H 315 211 97.2 - 0 0 248I160MI51M986I D15.5n.393148_36 FYIZYI001CPH7M orig_bc=TACTGAGCTA new_bc=TACTGAGCTA bc_diffs=0 4412540
+H 419 211 98.6 - 0 0 253I211M959I D21.a.393001_54 FYIZYI001CFHC4 orig_bc=ACATACGCGT new_bc=ACATACGCGT bc_diffs=0 4481131
+H 180 211 97.6 - 0 0 226I75MD95MD39M952I D2.393107_49 FYIZYI001B14Y3 orig_bc=TACTCTCGTG new_bc=TACTCTCGTG bc_diffs=0 2232355
+H 354 211 98.6 - 0 0 295I211M889I D21.b.393031_50 FYIZYI001E0FZ4 orig_bc=ACGCGAGTAT new_bc=ACGCGAGTAT bc_diffs=0 4449518
+H 200 211 98.1 - 0 0 269I177MI34M954I D21.5n.392974_53 FYIZYI001ERQIR orig_bc=TCGTCGCTCG new_bc=TCGTCGCTCG bc_diffs=0 3154070
+H 337 211 98.6 - 0 0 244I51MD26MD19MI113M973I D24.393019_55 FYIZYI001CNZ0K orig_bc=AGTACGCTAT new_bc=AGTACGCTAT bc_diffs=0 4437368
+H 424 211 99.5 - 0 0 249I77MD133M962I D19.b.393086_52 FYIZYI001ES8M6 orig_bc=TACGAGTATG new_bc=TACGAGTATG bc_diffs=0 4483037
+N * * * . * * * D12.392988_51 FYIZYI001EYE44 orig_bc=CGTGTCTCTA new_bc=CGTGTCTCTA bc_diffs=0 *
+H 421 211 99.1 - 0 0 262I211M957I D21.a.393001_58 FYIZYI001B7A6L orig_bc=ACATACGCGT new_bc=ACATACGCGT bc_diffs=0 4481719
+H 350 211 98.1 - 0 0 269I177MI34M956I D15.b.393025_57 FYIZYI001EX2HO orig_bc=CGAGAGATAC new_bc=CGAGAGATAC bc_diffs=0 4447950
+H 272 211 99.1 - 0 0 281I211M877I D16.a.393131_63 FYIZYI001B319H orig_bc=ATACGACGTA new_bc=ATACGACGTA bc_diffs=0 619817
+H 52 211 99.1 - 0 0 261I211M1008I D11.a.392963_62 FYIZYI001D7ZJ9 orig_bc=ATCAGACACG new_bc=ATCAGACACG bc_diffs=0 269386
+H 350 211 97.6 - 0 0 269I177MI34M956I D17.392970_60 FYIZYI001CQCCL orig_bc=CGTCTAGTAC new_bc=CGTCTAGTAC bc_diffs=0 4447950
+H 419 211 98.6 - 0 0 253I211M959I D3.393129_66 FYIZYI001CGPI0 orig_bc=TACACGTGAT new_bc=TACACGTGAT bc_diffs=0 4481131
+N * * * . * * * D21.5n.392974_43 FYIZYI001B1B5X orig_bc=TCGTCGCTCG new_bc=TCGTCGCTCG bc_diffs=0 *
+H 419 211 98.6 - 0 0 253I211M959I D22.5n.393054_72 FYIZYI001B41NB orig_bc=ACTGTACAGT new_bc=ACTGTACAGT bc_diffs=0 4481131
+H 121 211 99.1 - 0 0 262I90MD86MI34M895I D21.a.393001_64 FYIZYI001C6RLV orig_bc=ACATACGCGT new_bc=ACATACGCGT bc_diffs=0 174337
+H 121 211 99.1 - 0 0 262I90MD86MI34M895I D22.393118_67 FYIZYI001BV2N4 orig_bc=ACTACTATGT new_bc=ACTACTATGT bc_diffs=0 174337
+H 67 211 98.6 - 0 0 293I211M992I D18.6m.393070_70 FYIZYI001EWXE9 orig_bc=TGTACTACTC new_bc=TGTACTACTC bc_diffs=0 215097
+H 276 211 99.5 - 0 0 282I143MI68M933I D22.393118_59 FYIZYI001CVL1U orig_bc=ACTACTATGT new_bc=ACTACTATGT bc_diffs=0 4358723
+H 118 211 99.5 - 0 0 284I211M894I D20.393127_79 FYIZYI001CMKWC orig_bc=TAGAGACGAG new_bc=TAGAGACGAG bc_diffs=0 175279
+H 121 211 99.5 - 0 0 261I177MI34M895I D2.393107_74 FYIZYI001E0128 orig_bc=TACTCTCGTG new_bc=TACTCTCGTG bc_diffs=0 174337
+H 424 211 100.0 - 0 0 248I211M962I D19.a.393146_78 FYIZYI001ECT9S orig_bc=CGTAGACTAG new_bc=CGTAGACTAG bc_diffs=0 4483037
+H 350 211 98.1 - 0 0 269I177MI34M956I D13.a.393151_73 FYIZYI001BPHT0 orig_bc=TAGTATCAGC new_bc=TAGTATCAGC bc_diffs=0 4447950
+H 350 211 97.7 - 0 0 268I144MI26MI41M956I D13.b.393109_77 FYIZYI001CIMAS orig_bc=TCTCTATGCG new_bc=TCTCTATGCG bc_diffs=0 4447950
+H 307 211 98.6 - 0 0 276I10MD166MI34M954I D15.a.392972_76 FYIZYI001CXV9P orig_bc=CATAGTAGTG new_bc=CATAGTAGTG bc_diffs=0 4401375
+H 379 211 100.0 - 0 0 256I211M896I D1.393095_84 FYIZYI001CU1AD orig_bc=ACGAGTGCGT new_bc=ACGAGTGCGT bc_diffs=0 4465746
+H 388 211 100.0 - 0 0 281I211M944I D21.5n.392974_80 FYIZYI001EX5NI orig_bc=TCGTCGCTCG new_bc=TCGTCGCTCG bc_diffs=0 4468234
+H 352 211 97.6 - 0 0 265I211M962I D29.393071_87 FYIZYI001CNMM0 orig_bc=TACACACACT new_bc=TACACACACT bc_diffs=0 4448492
+H 421 211 98.6 - 0 0 262I211M957I D14.393072_81 FYIZYI001B1IQ2 orig_bc=TGATACGTCT new_bc=TGATACGTCT bc_diffs=0 4481719
+H 354 211 98.1 - 0 0 294I9MI202M889I D22.393118_83 FYIZYI001B40WY orig_bc=ACTACTATGT new_bc=ACTACTATGT bc_diffs=0 4449518
+H 121 211 99.5 - 0 0 261I177MI34M895I D27.393075_82 FYIZYI001CEE2P orig_bc=CAGTAGACGT new_bc=CAGTAGACGT bc_diffs=0 174337
+H 121 211 99.1 - 0 0 261I177MI34M895I D26.392975_89 FYIZYI001EWXH6 orig_bc=CACGCTACGT new_bc=CACGCTACGT bc_diffs=0 174337
+H 350 211 98.1 - 0 0 269I177MI34M956I D10.5n.393082_86 FYIZYI001EZG8Q orig_bc=ACGCTCGACA new_bc=ACGCTCGACA bc_diffs=0 4447950
+H 297 211 97.2 - 0 0 280I12MI132MI42MI25M933I D10.b.393074_85 FYIZYI001CW3N2 orig_bc=AGCACTGTAG new_bc=AGCACTGTAG bc_diffs=0 4381553
+H 388 211 100.0 - 0 0 281I211M944I D20.393127_93 FYIZYI001B06HR orig_bc=TAGAGACGAG new_bc=TAGAGACGAG bc_diffs=0 4468234
+H 67 211 99.1 - 0 0 293I211M992I D26.392975_91 FYIZYI001E2TAL orig_bc=CACGCTACGT new_bc=CACGCTACGT bc_diffs=0 215097
+H 421 211 98.6 - 0 0 262I211M957I D16.b.393030_90 FYIZYI001DLIP0 orig_bc=TTACGTACTA new_bc=TCACGTACTA bc_diffs=1 4481719
+H 388 211 100.0 - 0 0 281I211M944I D19.b.393086_98 FYIZYI001B020X orig_bc=TACGAGTATG new_bc=TACGAGTATG bc_diffs=0 4468234
+H 379 211 99.5 - 0 0 255I201MI10M896I D12.392988_99 FYIZYI001EY4UF orig_bc=CGTGTCTCTA new_bc=CGTGTCTCTA bc_diffs=0 4465746
+H 246 211 97.6 - 0 0 273I77MD133M1000I D19.a.393146_95 FYIZYI001B0J90 orig_bc=CGTAGACTAG new_bc=CGTAGACTAG bc_diffs=0 4308637
+H 354 211 98.6 - 0 0 295I211M889I D19.a.393146_97 FYIZYI001CTYTS orig_bc=CGTAGACTAG new_bc=CGTAGACTAG bc_diffs=0 4449518
+H 350 211 98.1 - 0 0 269I177MI34M956I D22.5n.393054_96 FYIZYI001EPS5M orig_bc=ACTGTACAGT new_bc=ACTGTACAGT bc_diffs=0 4447950
+H 428 211 97.2 - 0 0 255I173MD37M944I D13.5n.393036_100 FYIZYI001E1HSI orig_bc=CTCGCGTGTC new_bc=CTCGCGTGTC bc_diffs=0 4484111
+H 419 211 98.6 - 0 0 253I211M959I D30.392996_94 FYIZYI001CZLJH orig_bc=TACAGATCGT new_bc=TACAGATCGT bc_diffs=0 4481131
+H 421 211 97.6 - 0 0 264I7MD168MD34M957I D14.393072_103 FYIZYI001CUDP6 orig_bc=TGATACGTCT new_bc=TGATACGTCT bc_diffs=0 4481719
+H 354 211 98.6 - 0 0 295I211M889I D13.5n.393036_108 FYIZYI001CEUZ0 orig_bc=CTCGCGTGTC new_bc=CTCGCGTGTC bc_diffs=0 4449518
+H 121 211 99.1 - 0 0 262I90MD86MI34M895I D31.393093_109 FYIZYI001B0AY7 orig_bc=TACGCTGTCT new_bc=TACGCTGTCT bc_diffs=0 174337
+H 322 211 99.5 - 0 0 218I176MI35M950I D19.a.393146_106 FYIZYI001EC6BS orig_bc=CGTAGACTAG new_bc=CGTAGACTAG bc_diffs=0 4416951
+H 388 211 99.1 - 0 0 279I144MI42MI25M944I D18.6m.393070_102 FYIZYI001CZX37 orig_bc=TGTACTACTC new_bc=TGTACTACTC bc_diffs=0 4468234
+H 421 211 99.1 - 0 0 262I211M957I D22.5n.393054_111 FYIZYI001EY7NF orig_bc=ACTGTACAGT new_bc=ACTGTACAGT bc_diffs=0 4481719
+H 419 211 98.6 - 0 0 253I211M959I D13.b.393109_110 FYIZYI001CQ5AR orig_bc=TCTCTATGCG new_bc=TCTCTATGCG bc_diffs=0 4481131
+H 417 211 99.1 - 0 0 261I142MI15MI54M969I D10.5n.393082_113 FYIZYI001CQG96 orig_bc=ACGCTCGACA new_bc=ACGCTCGACA bc_diffs=0 4480359
+H 348 211 99.1 - 0 0 267I144MI30MI37M962I D14.393072_112 FYIZYI001AZNVD orig_bc=TGATACGTCT new_bc=TGATACGTCT bc_diffs=0 4446898
+H 388 211 100.0 - 0 0 281I211M944I D21.5n.392974_116 FYIZYI001EZT0I orig_bc=TCGTCGCTCG new_bc=TCGTCGCTCG bc_diffs=0 4468234
+N * * * . * * * D13.5n.393036_101 FYIZYI001AJUVV orig_bc=CTCGCGTGTC new_bc=CTCGCGTGTC bc_diffs=0 *
+H 421 211 98.6 - 0 0 261I143MI68M957I D14.393072_114 FYIZYI001BZEMP orig_bc=TGATACGTCT new_bc=TGATACGTCT bc_diffs=0 4481719
+H 297 211 100.0 - 0 0 283I211M933I D10.a.393084_120 FYIZYI001BWLI0 orig_bc=AGACGCACTC new_bc=AGACGCACTC bc_diffs=0 4381553
+H 388 211 100.0 - 0 0 281I211M944I D11.a.392963_122 FYIZYI001CPM06 orig_bc=ATCAGACACG new_bc=ATCAGACACG bc_diffs=0 4468234
+H 420 211 97.2 - 0 0 286I172MD38M997I D13.b.393109_119 FYIZYI001EFI76 orig_bc=TCTCTATGCG new_bc=TCTCTATGCG bc_diffs=0 4481359
+H 419 211 98.6 - 0 0 253I211M959I D1.393095_127 FYIZYI001C7QWM orig_bc=ACGAGTGCGT new_bc=ACGAGTGCGT bc_diffs=0 4481131
+H 421 211 99.1 - 0 0 262I211M957I D28.393022_124 FYIZYI001D0O24 orig_bc=CGACGTGACT new_bc=CGACGTGACT bc_diffs=0 4481719
+H 388 211 100.0 - 0 0 281I211M944I D11.a.392963_126 FYIZYI001C17CY orig_bc=ATCAGACACG new_bc=ATCAGACACG bc_diffs=0 4468234
+H 421 211 99.1 - 0 0 262I211M957I D24.393019_128 FYIZYI001CW7C8 orig_bc=AGTACGCTAT new_bc=AGTACGCTAT bc_diffs=0 4481719
+H 261 211 99.1 - 0 0 261I211M981I D21.a.393001_132 FYIZYI001DWUG5 orig_bc=ACATACGCGT new_bc=ACATACGCGT bc_diffs=0 4331364
+H 388 211 100.0 - 0 0 281I211M944I D3.393129_130 FYIZYI001EX2KV orig_bc=TACACGTGAT new_bc=TACACGTGAT bc_diffs=0 4468234
+H 320 211 97.6 - 0 0 257I211M971I D21.5n.392974_134 FYIZYI001DFZEH orig_bc=TCGTCGCTCG new_bc=TCGTCGCTCG bc_diffs=0 4416570
+H 276 211 99.5 - 0 0 282I143MI68M933I D30.392996_125 FYIZYI001CAW20 orig_bc=TACAGATCGT new_bc=TACAGATCGT bc_diffs=0 4358723
+H 350 211 98.1 - 0 0 269I177MI34M956I D13.b.393109_131 FYIZYI001EDVLU orig_bc=TCTCTATGCG new_bc=TCTCTATGCG bc_diffs=0 4447950
+H 354 211 98.6 - 0 0 295I211M889I D19.5n.393000_136 FYIZYI001COL2A orig_bc=ACGACTACAG new_bc=ACGACTACAG bc_diffs=0 4449518
+H 388 211 100.0 - 0 0 281I211M944I D28.393022_140 FYIZYI001CXW17 orig_bc=CGACGTGACT new_bc=CGACGTGACT bc_diffs=0 4468234
diff --git a/inst/scripts/installer.R b/inst/scripts/installer.R
new file mode 100644
index 0000000..e416a5c
--- /dev/null
+++ b/inst/scripts/installer.R
@@ -0,0 +1,49 @@
+##############################
+# An example function for installing phyloseq from various sources.
+# branch:      "release" (BioC release), "devel" (BioC devel, built from
+#              source), or "github" (joey711/phyloseq via devtools).
+# minRVersion: minimum R version required; stop() is called if R is older.
+# verbose:     if TRUE, emit a progress message before installing.
+# Returns a status string; an unsupported `branch` yields a hint string
+# instead of an error.
+##############################
+install_phyloseq <- function(branch = "release",
+                             minRVersion = "3.3.0",
+                             verbose = TRUE) {
+  if (compareVersion(as.character(getRversion()), minRVersion) < 0) {
+    stop("phyloseq installation script failed.\n",
+         "R ", minRVersion, " or greater is required.")
+  }
+  branch <- as.character(branch)
+  if (branch == "release") {
+    if (verbose) message("Installing the release version from BioC")
+    source("https://bioconductor.org/biocLite.R")
+    biocLite("phyloseq", suppressUpdates = TRUE)
+    return("phyloseq installed from BioC release branch (if no errors).")
+  }
+  if (branch == "devel") {
+    if (verbose) message("\n\nInstalling phyloseq from the devel version from BioC...\n")
+    # Source biocLite.R here too (as the release branch does); previously this
+    # branch called biocLite() and relied on an earlier session defining it.
+    source("https://bioconductor.org/biocLite.R")
+    biocLite("phyloseq", siteRepos = "https://bioconductor.org/packages/devel/bioc",
+             suppressUpdates = TRUE, type = "source")
+    return("phyloseq installed from BioC devel branch (if no errors).")
+  }
+  if (branch == "github") {
+    if (verbose) message("Installing the devel version from joey711/master from GitHub")
+    if (!requireNamespace("devtools", quietly = TRUE)) {
+      # Note: needs Curl for RCurl
+      install.packages("devtools")
+    }
+    devtools::install_github("joey711/phyloseq")  # namespaced; no attach needed
+    return("phyloseq installed from GitHub `joey711/phyloseq` (if no errors).")
+  }
+  return("You probably selected an unsupported argument to `branch`.
+ Try again using 'release', 'devel', or 'github'.")
+}
+###############
+# Run the installer with its default parameters (branch = "release").
+# Alternatives: install_phyloseq("devel") or install_phyloseq("github").
+###############
+install_phyloseq()
diff --git a/man/DPCoA.Rd b/man/DPCoA.Rd
new file mode 100644
index 0000000..405e47f
--- /dev/null
+++ b/man/DPCoA.Rd
@@ -0,0 +1,91 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/ordination-methods.R
+\name{DPCoA}
+\alias{DPCoA}
+\title{Calculate Double Principal Coordinate Analysis (DPCoA)
+using phylogenetic distance}
+\usage{
+DPCoA(physeq, correction = cailliez, scannf = FALSE, ...)
+}
+\arguments{
+\item{physeq}{(Required). A \code{\link{phyloseq-class}} object
+containing, at a minimum, abundance (\code{\link{otu_table-class}}) and
+phylogenetic (\code{\link[ape]{phylo}}) components.
+As a test, the accessors \code{\link{otu_table}} and \code{\link{phy_tree}}
+should return an object without error.}
+
+\item{correction}{(Optional). A function. The function must be
+ able to take a non-Euclidean \code{\link{dist}}ance object,
+ and return a new \code{dist}ance object that is Euclidean.
+ If testing a distance object, try \code{\link[ade4]{is.euclid}}.
+
+ In most real-life, real-data applications, the phylogenetic tree
+ will not provide a Euclidean distance matrix, and so a correction
+ will be needed.
+ Two recommended correction methods are
+ \code{\link[ade4]{cailliez}} and \code{\link[ade4]{lingoes}}.
+ The default is \code{cailliez},
+ but not for any particularly special reason. If the patristic
+ distance matrix turns out to be Euclidean, no correction will be
+ performed, regardless of the value of the \code{correction} argument.}
+
+\item{scannf}{(Optional). Logical. Default is \code{FALSE}. This
+is passed directly to \code{\link[ade4]{dpcoa}}, and causes a
+barplot of eigenvalues to be created if \code{TRUE}. This is not
+included in \code{...} because the default for \code{\link[ade4]{dpcoa}}
+is \code{TRUE}, although in many expected situations we would want
+to suppress creating the barplot.}
+
+\item{...}{Additional arguments passed to \code{\link[ade4]{dpcoa}}.}
+}
+\value{
+A \code{dpcoa}-class object (see \code{\link[ade4]{dpcoa}}).
+}
+\description{
+Function uses abundance (\code{\link{otu_table-class}}) and
+phylogenetic (\code{\link[ape]{phylo}}) components of a
+\code{\link{phyloseq-class}} experiment-level object
+to perform a
+Double Principal Coordinate Analysis (DPCoA), relying heavily on
+the underlying (and more general) function, \code{\link[ade4]{dpcoa}}.
+The distance object ultimately provided is the cophenetic/patristic
+(\code{\link[ape]{cophenetic.phylo}}) distance between the species.
+}
+\details{
+In most real-life, real-data applications, the phylogenetic tree
+will not provide a Euclidean distance matrix, and so a correction
+will be performed, if needed. See \code{correction} argument.
+}
+\examples{
+# # # # # # Esophagus
+data(esophagus)
+eso.dpcoa <- DPCoA(esophagus)
+eso.dpcoa
+plot_ordination(esophagus, eso.dpcoa, "samples")
+plot_ordination(esophagus, eso.dpcoa, "species")
+plot_ordination(esophagus, eso.dpcoa, "biplot")
+#
+#
+# # # # # # GlobalPatterns
+data(GlobalPatterns)
+# subset GP to top-150 taxa (to save computation time in example)
+keepTaxa <- names(sort(taxa_sums(GlobalPatterns), TRUE)[1:150])
+GP <- prune_taxa(keepTaxa, GlobalPatterns)
+# Perform DPCoA
+GP.dpcoa <- DPCoA(GP)
+plot_ordination(GP, GP.dpcoa, color="SampleType")
+}
+\author{
+Julia Fukuyama \email{julia.fukuyama at gmail.com}.
+ Adapted for phyloseq by Paul J. McMurdie.
+}
+\references{
+Pavoine, S., Dufour, A.B. and Chessel, D. (2004)
+From dissimilarities among species to dissimilarities among communities:
+a double principal coordinate analysis.
+Journal of Theoretical Biology, 228, 523-537.
+}
+\seealso{
+\code{\link[ade4]{dpcoa}}
+}
+
diff --git a/man/JSD.Rd b/man/JSD.Rd
new file mode 100644
index 0000000..47020a1
--- /dev/null
+++ b/man/JSD.Rd
@@ -0,0 +1,60 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/distance-methods.R
+\name{JSD}
+\alias{JSD}
+\title{Calculate the Jensen-Shannon Divergence (distance)}
+\usage{
+JSD(physeq)
+}
+\arguments{
+\item{physeq}{(Required). \code{\link{phyloseq-class}}.
+The phyloseq data on which to compute the
+pairwise sample distance matrix.}
+}
+\value{
+An object of class ``\code{\link{dist}}'' suitable for certain
+ ordination methods and other distance-based analyses.
+ See \code{\link{distance}}.
+}
+\description{
+This is a phyloseq-specific implementation of the Jensen-Shannon Divergence
+for comparing pairs of microbial communities (samples) in an experiment.
+The expectation is that you have many samples (say, more than two) and you
+want a distance matrix on which you will perform further analysis. \code{JSD} is
+intended to be ``wrapped'' by the more general \code{\link{distance}}
+function in phyloseq, and it can be invoked using \code{"jsd"} as the
+argument to the \code{method} parameter of \code{\link{distance}}.
+}
+\details{
+One of the motivations for providing JSD in phyloseq was its recent use in
+the analysis of the \code{\link{enterotype}} dataset.
+}
+\examples{
+# library(doParallel) # Do this and next line only if you have multi-cores
+# registerDoParallel(cores=6)
+# data(enterotype)
+# # ent.jsd <- JSD(enterotype, TRUE) # internal only
+# ent.jsd <- distance(enterotype, "jsd", parallel=TRUE)
+# ent.PCoA <- ordinate(enterotype, "PCoA", ent.jsd) # Perform principal coordinate analysis
+# p <- plot_ordination(enterotype, ent.PCoA, color="Enterotype", shape="SeqTech")
+# (p <- p + geom_point(size=5, alpha=0.5))
+}
+\author{
+Susan Holmes \email{susan at stat.stanford.edu}.
+ Adapted for phyloseq by Paul J. McMurdie.
+}
+\references{
+Jensen-Shannon Divergence and Hilbert space embedding.
+Bent Fuglede and Flemming Topsoe University of Copenhagen,
+Department of Mathematics
+\url{http://www.math.ku.dk/~topsoe/ISIT2004JSD.pdf}
+}
+\seealso{
+\code{\link{distance}}
+
+ \code{\link{enterotype}}
+
+ \url{http://en.wikipedia.org/wiki/Jensen-Shannon_divergence}
+}
+\keyword{internal}
+
diff --git a/man/UniFrac-methods.Rd b/man/UniFrac-methods.Rd
new file mode 100644
index 0000000..209d12c
--- /dev/null
+++ b/man/UniFrac-methods.Rd
@@ -0,0 +1,174 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/distance-methods.R
+\docType{methods}
+\name{UniFrac}
+\alias{UniFrac}
+\alias{UniFrac,phyloseq-method}
+\title{Calculate weighted or unweighted (Fast) UniFrac distance for all sample pairs.}
+\usage{
+UniFrac(physeq, weighted=FALSE, normalized=TRUE, parallel=FALSE, fast=TRUE)
+
+\S4method{UniFrac}{phyloseq}(physeq, weighted = FALSE, normalized = TRUE,
+ parallel = FALSE, fast = TRUE)
+}
+\arguments{
+\item{physeq}{(Required). \code{\link{phyloseq-class}}, containing at minimum
+a phylogenetic tree (\code{\link{phylo-class}}) and
+contingency table (\code{\link{otu_table-class}}). See
+examples below for coercions that might be necessary.}
+
+\item{weighted}{(Optional). Logical. Should use weighted-UniFrac calculation?
+Weighted-UniFrac takes into account the relative abundance of species/taxa
+shared between samples, whereas unweighted-UniFrac only considers
+presence/absence. Default is \code{FALSE}, meaning the unweighted-UniFrac
+distance is calculated for all pairs of samples.}
+
+\item{normalized}{(Optional). Logical. Should the output be normalized such that values
+range from 0 to 1 independent of branch length values? Default is \code{TRUE}.
+Note that (unweighted) \code{UniFrac} is always normalized by total branch-length,
+and so this value is ignored when \code{weighted == FALSE}.}
+
+\item{parallel}{(Optional). Logical. Should execute calculation in parallel,
+using multiple CPU cores simultaneously? This can dramatically hasten the
+computation time for this function. However, it also requires that the user
+has registered a parallel ``backend'' prior to calling this function.
+Default is \code{FALSE}. If FALSE, UniFrac will register a serial backend
+so that \code{foreach::\%dopar\%} does not throw a warning.}
+
+\item{fast}{(Optional). Logical. DEPRECATED.
+Do you want to use the ``Fast UniFrac''
+algorithm? Implemented natively in the \code{phyloseq-package}.
+\code{TRUE} is now the only supported option.
+There should be no difference in the output between the two algorithms.
+Moreover, the original UniFrac algorithm
+only outperforms this implementation of fast-UniFrac if the datasets are so
+small
+(approximated by the value of \code{ntaxa(physeq) * nsamples(physeq)})
+that the difference in time is inconsequential (less than 1 second).
+In practice it does not appear that this parameter should
+have ever been set to \code{FALSE}, and therefore
+the original UniFrac implementation perhaps never should have been supported here.
+For legacy code support the option is now deprecated here
+(the implementation was an internal function, anyway)
+and the \code{fast} option will remain for one release cycle before
+being removed completely
+in order to avoid causing unsupported-argument errors.}
+}
+\value{
+a sample-by-sample distance matrix, suitable for NMDS, etc.
+}
+\description{
+This function calculates the (Fast) UniFrac distance for all sample-pairs
+in a \code{\link{phyloseq-class}} object.
+}
+\details{
+\code{UniFrac()} accesses the abundance
+(\code{\link{otu_table-class}}) and a phylogenetic tree (\code{\link{phylo-class}})
+data within an experiment-level (\code{\link{phyloseq-class}}) object.
+If the tree and contingency table are separate objects, suggested solution
+is to combine them into an experiment-level class
+using the \code{\link{phyloseq}} function. For example, the following code
+
+\code{phyloseq(myotu_table, myTree)}
+
+returns a \code{phyloseq}-class object that has been pruned and comprises
+the minimum arguments necessary for \code{UniFrac()}.
+
+Parallelization is possible for UniFrac calculated with the \code{\link{phyloseq-package}},
+and is encouraged in the instances of large trees, many samples, or both.
+Parallelization has been implemented via the \code{\link{foreach-package}}.
+This means that parallel calls need to be preceded by 2 or more commands
+that register the parallel ``backend''. This is achieved via your choice of
+helper packages. One of the simplest seems to be the \emph{doParallel} package.
+
+For more information, see the following links on registering the ``backend'':
+
+\emph{foreach} package manual:
+
+\url{http://cran.r-project.org/web/packages/foreach/index.html}
+
+Notes on parallel computing in \code{R}. Skip to the section describing
+the \emph{foreach Framework}. It gives off-the-shelf examples for registering
+a parallel backend using the \emph{doMC}, \emph{doSNOW}, or \emph{doMPI} packages:
+
+\url{http://trg.apbionet.org/euasiagrid/docs/parallelR.notes.pdf}
+
+Furthermore, as of \code{R} version \code{2.14.0} and higher, a parallel package
+is included as part of the core installation, \code{\link{parallel-package}},
+and this can be used as the parallel backend with the \code{\link{foreach-package}}
+using the adaptor package ``doParallel''.
+\url{http://cran.r-project.org/web/packages/doParallel/index.html}
+
+See the vignette for some simple examples for using doParallel.
+\url{http://cran.r-project.org/web/packages/doParallel/vignettes/gettingstartedParallel.pdf}
+
+UniFrac-specific examples for doParallel are provided in the example
+code below.
+}
+\examples{
+################################################################################
+# Perform UniFrac on esophagus data
+################################################################################
+data("esophagus")
+(y <- UniFrac(esophagus, TRUE))
+UniFrac(esophagus, TRUE, FALSE)
+UniFrac(esophagus, FALSE)
+# ################################################################################
+# # Now try a parallel implementation using doParallel, which leverages the
+# # new 'parallel' core package in R 2.14.0+
+# # Note that simply loading the 'doParallel' package is not enough, you must
+# # call a function that registers the backend. In general, this is pretty easy
+# # with the 'doParallel package' (or one of the alternative 'do*' packages)
+# #
+# # Also note that the esophagus example has only 3 samples, and a relatively small
+# # tree. This is fast to calculate even sequentially and does not warrant
+# # parallelized computation, but provides a good quick example for using UniFrac()
+# # in a parallel fashion. The number of cores you should specify during the
+# # backend registration, using registerDoParallel(), depends on your system and
+# # needs. 3 is chosen here for convenience. If your system has only 2 cores, this
+# # will probably fault or run slower than necessary.
+# ################################################################################
+# library(doParallel)
+# data(esophagus)
+# # For SNOW-like functionality (works on Windows):
+# cl <- makeCluster(3)
+# registerDoParallel(cl)
+# UniFrac(esophagus, TRUE)
+# # Force to sequential backend:
+# registerDoSEQ()
+# # For multicore-like functionality (will probably not work on windows),
+# # register the backend like this:
+# registerDoParallel(cores=3)
+# UniFrac(esophagus, TRUE)
+################################################################################
+}
+\references{
+\url{http://bmf.colorado.edu/unifrac/}
+
+The main implementation (Fast UniFrac) is adapted from the algorithm's
+description in:
+
+Hamady, Lozupone, and Knight,
+``\href{http://www.nature.com/ismej/journal/v4/n1/full/ismej200997a.html}{Fast UniFrac:}
+facilitating high-throughput phylogenetic analyses of
+microbial communities including analysis of pyrosequencing and PhyloChip data.''
+The ISME Journal (2010) 4, 17--27.
+
+See also additional descriptions of UniFrac in the following articles:
+
+Lozupone, Hamady and Knight, ``UniFrac - An Online Tool for Comparing Microbial
+Community Diversity in a Phylogenetic Context.'', BMC Bioinformatics 2006, 7:371
+
+Lozupone, Hamady, Kelley and Knight, ``Quantitative and qualitative (beta)
+diversity measures lead to different insights into factors that structure
+microbial communities.'' Appl Environ Microbiol. 2007
+
+Lozupone C, Knight R. ``UniFrac: a new phylogenetic method for comparing microbial
+communities.'' Appl Environ Microbiol. 2005 71 (12):8228-35.
+}
+\seealso{
+\code{\link{distance}}
+
+\code{unifrac} in the picante package.
+}
+
diff --git a/man/access.Rd b/man/access.Rd
new file mode 100644
index 0000000..9bb45f4
--- /dev/null
+++ b/man/access.Rd
@@ -0,0 +1,49 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/phyloseq-class.R
+\name{access}
+\alias{access}
+\title{Universal slot accessor function for phyloseq-class.}
+\usage{
+access(physeq, slot, errorIfNULL=FALSE)
+}
+\arguments{
+\item{physeq}{(Required). \code{\link{phyloseq-class}}.}
+
+\item{slot}{(Required). A character string indicating the slot (not data class)
+of the component data type that is desired.}
+
+\item{errorIfNULL}{(Optional). Logical. Should the accessor stop with
+an error if the slot is empty (\code{NULL})? Default \code{FALSE}.}
+}
+\value{
+Returns the component object specified by the argument \code{slot}.
+ Returns NULL if slot does not exist. Returns \code{physeq} as-is
+ if it is a component class that already matches the slot name.
+}
+\description{
+This function is used internally by many accessors and in
+many functions/methods that need to access a particular type of component data.
+If something is wrong, or the slot is missing, the expected behavior is that
+this function will return NULL. Thus, the output can be tested by
+\code{\link{is.null}} as verification of the presence of a particular
+data component. Unlike the component-specific accessors (e.g. \code{\link{otu_table}},
+or \code{\link{phy_tree}}),
+the default behavior is not to stop with an error if the desired slot is empty.
+In all cases this is controlled by the \code{errorIfNULL} argument, which can
+be set to \code{TRUE} if an error is desired.
+}
+\examples{
+#
+## data(GlobalPatterns)
+## access(GlobalPatterns, "tax_table")
+## access(GlobalPatterns, "phy_tree")
+## access(otu_table(GlobalPatterns), "otu_table")
+## # Should return NULL:
+## access(otu_table(GlobalPatterns), "sample_data")
+## access(otuTree(GlobalPatterns), "sample_data")
+## access(otuSam(GlobalPatterns), "phy_tree")
+}
+\seealso{
+\code{\link{getslots.phyloseq}}, \code{\link{merge_phyloseq}}
+}
+
diff --git a/man/assign-otu_table.Rd b/man/assign-otu_table.Rd
new file mode 100644
index 0000000..9f94c04
--- /dev/null
+++ b/man/assign-otu_table.Rd
@@ -0,0 +1,46 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/assignment-methods.R
+\docType{methods}
+\name{otu_table<-}
+\alias{assign-otu_table}
+\alias{otu_table<-}
+\alias{otu_table<-,otu_table,otu_table-method}
+\alias{otu_table<-,phyloseq,otu_table-method}
+\alias{otu_table<-,phyloseq,phyloseq-method}
+\title{Assign a new OTU Table to \code{x}}
+\usage{
+otu_table(x) <- value
+
+\S4method{otu_table}{phyloseq,otu_table}(x) <- value
+
+\S4method{otu_table}{otu_table,otu_table}(x) <- value
+
+\S4method{otu_table}{phyloseq,phyloseq}(x) <- value
+}
+\arguments{
+\item{x}{(Required). \code{\link{phyloseq-class}}}
+
+\item{value}{(Required).
+\code{\link{otu_table-class}}
+or
+\code{\link{phyloseq-class}}.}
+}
+\description{
+Assign a new OTU Table to \code{x}
+}
+\examples{
+# data(GlobalPatterns)
+# # An example of pruning to just the first 100 taxa in GlobalPatterns.
+# ex2a <- prune_taxa(taxa_names(GlobalPatterns)[1:100], GlobalPatterns)
+# # The following 3 lines produce an ex2b that is equal to ex2a
+# ex2b <- GlobalPatterns
+# OTU <- otu_table(GlobalPatterns)[1:100, ]
+# otu_table(ex2b) <- OTU
+# identical(ex2a, ex2b)
+# print(ex2b)
+# # Replace otu_table by implying the component in context.
+# ex2c <- GlobalPatterns
+# otu_table(ex2c) <- ex2b
+# identical(ex2a, ex2c)
+}
+
diff --git a/man/assign-phy_tree.Rd b/man/assign-phy_tree.Rd
new file mode 100644
index 0000000..4f6db6f
--- /dev/null
+++ b/man/assign-phy_tree.Rd
@@ -0,0 +1,35 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/assignment-methods.R
+\docType{methods}
+\name{phy_tree<-}
+\alias{assign-phy_tree}
+\alias{phy_tree<-}
+\alias{phy_tree<-,phyloseq,phylo-method}
+\alias{phy_tree<-,phyloseq,phyloseq-method}
+\title{Assign a (new) phylogenetic tree to \code{x}}
+\usage{
+phy_tree(x) <- value
+
+\S4method{phy_tree}{phyloseq,phylo}(x) <- value
+
+\S4method{phy_tree}{phyloseq,phyloseq}(x) <- value
+}
+\arguments{
+\item{x}{(Required). \code{\link{phyloseq-class}}}
+
+\item{value}{(Required). \code{\link{phylo-class}}, or \code{\link{phyloseq-class}}}
+}
+\description{
+Assign a (new) phylogenetic tree to \code{x}
+}
+\examples{
+#
+data("esophagus")
+# An example of pruning to just the first 20 taxa in esophagus
+ex2a <- prune_taxa(taxa_names(esophagus)[1:20], esophagus)
+# The following 3 lines produce an ex2b that is equal to ex2a
+ex2b <- ex2a
+phy_tree(ex2b) <- phy_tree(esophagus)
+identical(ex2a, ex2b)
+}
+
diff --git a/man/assign-sample_data.Rd b/man/assign-sample_data.Rd
new file mode 100644
index 0000000..346a8f1
--- /dev/null
+++ b/man/assign-sample_data.Rd
@@ -0,0 +1,48 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/assignment-methods.R
+\name{sample_data<-}
+\alias{assign-sample_data}
+\alias{sample_data<-}
+\title{Assign (new) sample_data to \code{x}}
+\usage{
+sample_data(x) <- value
+}
+\arguments{
+\item{x}{(Required). \code{\link{phyloseq-class}}. The object to modify.}
+
+\item{value}{(Required). Either a \code{\link{sample_data-class}},
+a \code{data.frame} that can be coerced into \code{\link{sample_data-class}},
+or a \code{\link{phyloseq-class}} that contains a
+suitable \code{sample_data} component to assign to \code{x}. If unsure,
+try \code{\link{sample_data}}\code{(value)}, which should return a
+\code{\link{sample_data-class}} object without error.}
+}
+\value{
+No return. This is an assignment statement.
+}
+\description{
+This replaces the current \code{sample_data} component of \code{x} with
+\code{value}, if \code{value} is a \code{\link{sample_data-class}}. However,
+if \code{value} is a \code{data.frame}, then \code{value} is first coerced to
+a \code{\link{sample_data-class}}, and then assigned. Alternatively, if
+\code{value} is \code{\link{phyloseq-class}}, then the
+\code{\link{sample_data}} component will first be accessed from \code{value}
+ and then assigned. This makes possible some concise assignment/replacement
+ statements when adjusting, modifying, or building subsets of
+ experiment-level data. See some examples below.
+
+Internally, this re-builds the \code{\link{phyloseq-class}} object using
+the standard \code{\link{phyloseq}} constructor. Thus, index mismatches
+between sample-describing components will not be allowed, and subsetting
+will occur automatically such that only the intersection of sample IDs
+are included in any components. This has the added benefit of re-checking
+(internally) for any other issues.
+}
+\examples{
+ data(soilrep)
+ soilrep
+ head(sample_data(soilrep))
+ sample_data(soilrep)$Time <- as.integer(substr(sample_data(soilrep)$Sample, 1, 1))
+ head(sample_data(soilrep))
+}
+
diff --git a/man/assign-sample_names.Rd b/man/assign-sample_names.Rd
new file mode 100644
index 0000000..cf0ca7b
--- /dev/null
+++ b/man/assign-sample_names.Rd
@@ -0,0 +1,51 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/assignment-methods.R
+\docType{methods}
+\name{sample_names<-}
+\alias{assign-sample_names}
+\alias{sample_names<-}
+\alias{sample_names<-,ANY,ANY-method}
+\alias{sample_names<-,ANY,character-method}
+\alias{sample_names<-,otu_table,character-method}
+\alias{sample_names<-,phyloseq,character-method}
+\alias{sample_names<-,sample_data,character-method}
+\title{Replace sample identifier names}
+\usage{
+sample_names(x) <- value
+
+\S4method{sample_names}{ANY,ANY}(x) <- value
+
+\S4method{sample_names}{ANY,character}(x) <- value
+
+\S4method{sample_names}{otu_table,character}(x) <- value
+
+\S4method{sample_names}{sample_data,character}(x) <- value
+
+\S4method{sample_names}{phyloseq,character}(x) <- value
+}
+\arguments{
+\item{x}{(Required). An object defined by the \code{\link{phyloseq-package}}
+that describes samples in some way.}
+
+\item{value}{(Required). A character vector
+to replace the current \code{\link{sample_names}}.}
+}
+\description{
+Replace sample identifier names
+}
+\examples{
+data("esophagus")
+sample_names(esophagus)
+# plot_tree(esophagus, color="sample_names", ladderize="left")
+sample_names(esophagus) <- paste("Sa-", sample_names(esophagus), sep="")
+sample_names(esophagus)
+# plot_tree(esophagus, color="sample_names", ladderize="left")
+## non-characters are first coerced to characters.
+sample_names(esophagus) <- 1:nsamples(esophagus)
+sample_names(esophagus)
+# plot_tree(esophagus, color="sample_names", ladderize="left")
+## Cannot assign non-unique or differently-lengthed name vectors. Error.
+# sample_names(esophagus) <- sample(c(TRUE, FALSE), nsamples(esophagus), TRUE)
+# sample_names(esophagus) <- sample(sample_names(esophagus), nsamples(esophagus)-1, FALSE)
+}
+
diff --git a/man/assign-tax_table.Rd b/man/assign-tax_table.Rd
new file mode 100644
index 0000000..e17d8ba
--- /dev/null
+++ b/man/assign-tax_table.Rd
@@ -0,0 +1,53 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/assignment-methods.R
+\docType{methods}
+\name{tax_table<-}
+\alias{assign-tax_table}
+\alias{tax_table<-}
+\alias{tax_table<-,phyloseq,ANY-method}
+\alias{tax_table<-,phyloseq,taxonomyTable-method}
+\alias{tax_table<-,taxonomyTable,ANY-method}
+\alias{tax_table<-,taxonomyTable,taxonomyTable-method}
+\title{Assign a (new) Taxonomy Table to \code{x}}
+\usage{
+tax_table(x) <- value
+
+\S4method{tax_table}{phyloseq,taxonomyTable}(x) <- value
+
+\S4method{tax_table}{phyloseq,ANY}(x) <- value
+
+\S4method{tax_table}{taxonomyTable,taxonomyTable}(x) <- value
+
+\S4method{tax_table}{taxonomyTable,ANY}(x) <- value
+}
+\arguments{
+\item{x}{(Required). \code{\link{phyloseq-class}}}
+
+\item{value}{(Required). \code{\link{taxonomyTable-class}}.
+Alternatively, \code{value} can be a \code{\link{phyloseq-class}} that has
+a \code{\link{tax_table}} component, or a \code{\link{matrix-class}}
+that can be coerced to a \code{\link{taxonomyTable-class}} with row indices
+that match at least some of the \code{\link{taxa_names}} of \code{x}.}
+}
+\description{
+Assign a (new) Taxonomy Table to \code{x}
+}
+\examples{
+# data(GlobalPatterns)
+# # An example of pruning to just the first 100 taxa in GlobalPatterns.
+# ex2a <- prune_taxa(taxa_names(GlobalPatterns)[1:100], GlobalPatterns)
+# # The following 3 lines produce an ex2b that is equal to ex2a
+# ex2b <- GlobalPatterns
+# TT <- tax_table(GlobalPatterns)[1:100, ]
+# tax_table(ex2b) <- TT
+# identical(ex2a, ex2b)
+# print(ex2b)
+# # 2 examples adding a tax_table component from phyloseq or matrix classes
+# ex2c <- phyloseq(otu_table(ex2b), sample_data(ex2b), phy_tree(ex2b))
+# tax_table(ex2c) <- ex2b
+# identical(ex2a, ex2c)
+# ex2c <- phyloseq(otu_table(ex2b), sample_data(ex2b), phy_tree(ex2b))
+# tax_table(ex2c) <- as(tax_table(ex2b), "matrix")
+# identical(ex2a, ex2c)
+}
+
diff --git a/man/assign-taxa_are_rows.Rd b/man/assign-taxa_are_rows.Rd
new file mode 100644
index 0000000..cf217da
--- /dev/null
+++ b/man/assign-taxa_are_rows.Rd
@@ -0,0 +1,33 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/assignment-methods.R
+\docType{methods}
+\name{taxa_are_rows<-}
+\alias{assign-taxa_are_rows}
+\alias{taxa_are_rows<-}
+\alias{taxa_are_rows<-,otu_table,logical-method}
+\alias{taxa_are_rows<-,phyloseq,logical-method}
+\title{Manually change taxa_are_rows through assignment.}
+\usage{
+taxa_are_rows(x) <- value
+
+\S4method{taxa_are_rows}{otu_table,logical}(x) <- value
+
+\S4method{taxa_are_rows}{phyloseq,logical}(x) <- value
+}
+\arguments{
+\item{x}{\code{\link{otu_table-class}} or \code{\link{phyloseq-class}}}
+
+\item{value}{A logical of length equal to 1. If \code{length(value) > 1},
+the additional elements will be ignored. Only the first element is assigned
+to the taxa_are_rows slot.}
+}
+\description{
+The taxa_are_rows slot is a logical indicating the orientation of the
+abundance table contained in object \code{x}.
+}
+\examples{
+ data(esophagus)
+ taxa_are_rows(esophagus)
+ taxa_are_rows(otu_table(esophagus))
+}
+
diff --git a/man/assign-taxa_names.Rd b/man/assign-taxa_names.Rd
new file mode 100644
index 0000000..3c82d5e
--- /dev/null
+++ b/man/assign-taxa_names.Rd
@@ -0,0 +1,57 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/assignment-methods.R
+\docType{methods}
+\name{taxa_names<-}
+\alias{assign-taxa_names}
+\alias{taxa_names<-}
+\alias{taxa_names<-,ANY,ANY-method}
+\alias{taxa_names<-,ANY,character-method}
+\alias{taxa_names<-,XStringSet,character-method}
+\alias{taxa_names<-,otu_table,character-method}
+\alias{taxa_names<-,phylo,character-method}
+\alias{taxa_names<-,phyloseq,character-method}
+\alias{taxa_names<-,taxonomyTable,character-method}
+\title{Replace OTU identifier names}
+\usage{
+taxa_names(x) <- value
+
+\S4method{taxa_names}{ANY,ANY}(x) <- value
+
+\S4method{taxa_names}{ANY,character}(x) <- value
+
+\S4method{taxa_names}{otu_table,character}(x) <- value
+
+\S4method{taxa_names}{taxonomyTable,character}(x) <- value
+
+\S4method{taxa_names}{phylo,character}(x) <- value
+
+\S4method{taxa_names}{XStringSet,character}(x) <- value
+
+\S4method{taxa_names}{phyloseq,character}(x) <- value
+}
+\arguments{
+\item{x}{(Required). An object defined by the \code{\link{phyloseq-package}}
+that describes OTUs in some way.}
+
+\item{value}{(Required). A character vector
+to replace the current \code{\link{taxa_names}}.}
+}
+\description{
+Replace OTU identifier names
+}
+\examples{
+data("esophagus")
+taxa_names(esophagus)
+# plot_tree(esophagus, label.tips="taxa_names", ladderize="left")
+taxa_names(esophagus) <- paste("OTU-", taxa_names(esophagus), sep="")
+taxa_names(esophagus)
+# plot_tree(esophagus, label.tips="taxa_names", ladderize="left")
+## non-characters are first coerced to characters.
+taxa_names(esophagus) <- 1:ntaxa(esophagus)
+taxa_names(esophagus)
+# plot_tree(esophagus, label.tips="taxa_names", ladderize="left")
+## Cannot assign non-unique or differently-lengthed name vectors. Error.
+# taxa_names(esophagus) <- sample(c(TRUE, FALSE), ntaxa(esophagus), TRUE)
+# taxa_names(esophagus) <- sample(taxa_names(esophagus), ntaxa(esophagus)-5, FALSE)
+}
+
diff --git a/man/build_tax_table.Rd b/man/build_tax_table.Rd
new file mode 100644
index 0000000..387fab7
--- /dev/null
+++ b/man/build_tax_table.Rd
@@ -0,0 +1,43 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/IO-methods.R
+\name{build_tax_table}
+\alias{build_tax_table}
+\title{Build a \code{\link{tax_table}} from a named possibly-jagged list}
+\usage{
+build_tax_table(taxlist)
+}
+\arguments{
+\item{taxlist}{(Required). A list in which each element is a vector of
+taxonomic assignments named by rank.
+Every element of every vector must be named by the rank it represents.
+Every element of the list (every vector) should correspond to a single OTU
+and be named for that OTU.}
+}
+\value{
+A \code{\link{tax_table}} (\code{\link{taxonomyTable-class}}) that
+ has been built from \code{taxlist}. The OTU names of this output will be
+ the element names of \code{taxlist}, and a separate taxonomic rank
+ (column) will be included for each unique rank found among the element names
+ of each vector in the list. \code{NA_character_} is the default value of
+ elements in the \code{\link{tax_table}} for which there is no corresponding
+ information in \code{taxlist}.
+}
+\description{
+Build a \code{\link{tax_table}} from a named possibly-jagged list
+}
+\examples{
+ taxvec1 = c("Root", "k__Bacteria", "p__Firmicutes", "c__Bacilli", "o__Bacillales", "f__Staphylococcaceae")
+ parse_taxonomy_default(taxvec1)
+ parse_taxonomy_greengenes(taxvec1)
+ taxvec2 = c("Root;k__Bacteria;p__Firmicutes;c__Bacilli;o__Bacillales;f__Staphylococcaceae")
+ parse_taxonomy_qiime(taxvec2)
+ taxlist1 = list(OTU1=parse_taxonomy_greengenes(taxvec1), OTU2=parse_taxonomy_qiime(taxvec2))
+ taxlist2 = list(OTU1=parse_taxonomy_default(taxvec1), OTU2=parse_taxonomy_qiime(taxvec2))
+ build_tax_table(taxlist1)
+ build_tax_table(taxlist2)
+}
+\seealso{
+\code{\link{import_biom}}
+ \code{\link{import_qiime}}
+}
+
diff --git a/man/capscale-phyloseq-methods.Rd b/man/capscale-phyloseq-methods.Rd
new file mode 100644
index 0000000..530078e
--- /dev/null
+++ b/man/capscale-phyloseq-methods.Rd
@@ -0,0 +1,70 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/ordination-methods.R
+\docType{methods}
+\name{capscale.phyloseq}
+\alias{capscale.phyloseq}
+\alias{capscale.phyloseq,phyloseq,formula,character-method}
+\alias{capscale.phyloseq,phyloseq,formula,dist-method}
+\title{Constrained Analysis of Principal Coordinates, \code{\link[vegan]{capscale}}.}
+\usage{
+capscale.phyloseq(physeq, formula, distance, ...)
+
+\S4method{capscale.phyloseq}{phyloseq,formula,dist}(physeq, formula, distance,
+ ...)
+
+\S4method{capscale.phyloseq}{phyloseq,formula,character}(physeq, formula,
+ distance, ...)
+}
+\arguments{
+\item{physeq}{(Required). Phylogenetic sequencing data
+(\code{\link{phyloseq-class}}).
+The data on which you want to perform the ordination.}
+
+\item{formula}{(Required). A \code{\link{formula}}, specifying the input.
+No need to directly access components. \code{capscale.phyloseq} understands
+where to find the abundance table (LHS) and \code{\link{sample_data}} (RHS)
+from within the phyloseq object.}
+
+\item{distance}{(Required). A \code{\link{character}} string, specifying
+the name of the dissimilarity (or distance) method supported by
+the phyloseq \code{\link[phyloseq]{distance}} function.
+Alternatively, a pre-computed \code{\link{dist}}-object can be provided here,
+in which case it supersedes any use of the \code{\link{otu_table}}
+in your phyloseq object.
+
+Note that \code{\link[vegan]{capscale}}
+with Euclidean distances will be identical to \code{\link[vegan]{rda}}
+in eigenvalues and in site, species, and biplot scores
+(except for possible sign reversal). However, it makes no sense to use
+\code{\link[vegan]{capscale}} with Euclidean distances,
+since direct use of \code{\link[vegan]{rda}} is much more efficient
+(and supported in the \code{\link{ordinate}} function with \code{method=="RDA"}).
+Even with non-Euclidean dissimilarities,
+the rest of the analysis will be metric and linear.}
+
+\item{...}{(Optional). Additional named arguments passed to
+\code{\link[vegan]{capscale}}.}
+}
+\value{
+Ordination object defined by \code{\link[vegan]{capscale}}.
+}
+\description{
+See \code{\link[vegan]{capscale}} for details. A formula is the main input.
+}
+\examples{
+# See other examples at
+# http://joey711.github.io/phyloseq/plot_ordination-examples
+data(GlobalPatterns)
+GP = prune_taxa(names(sort(taxa_sums(GlobalPatterns), TRUE)[1:50]), GlobalPatterns)
+ordcap = ordinate(GP, "CAP", "bray", ~SampleType)
+plot_ordination(GP, ordcap, "samples", color="SampleType")
+}
+\seealso{
+\code{\link{plot_ordination}}
+
+ \code{\link[vegan]{rda}}
+
+ \code{\link[vegan]{capscale}}
+}
+\keyword{internal}
+
diff --git a/man/cca-rda-phyloseq-methods.Rd b/man/cca-rda-phyloseq-methods.Rd
new file mode 100644
index 0000000..e7c4810
--- /dev/null
+++ b/man/cca-rda-phyloseq-methods.Rd
@@ -0,0 +1,60 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/ordination-methods.R
+\docType{methods}
+\name{cca.phyloseq}
+\alias{cca.phyloseq}
+\alias{cca.phyloseq,otu_table,ANY-method}
+\alias{cca.phyloseq,otu_table-method}
+\alias{cca.phyloseq,phyloseq,NULL-method}
+\alias{cca.phyloseq,phyloseq,formula-method}
+\alias{rda.phyloseq}
+\title{Constrained Correspondence Analysis and Redundancy Analysis.}
+\usage{
+cca.phyloseq(physeq, formula = NULL, method = "CCA", ...)
+
+\S4method{cca.phyloseq}{phyloseq,formula}(physeq, formula = NULL,
+ method = "CCA", ...)
+
+\S4method{cca.phyloseq}{otu_table,ANY}(physeq, formula = NULL,
+ method = "CCA", ...)
+
+\S4method{cca.phyloseq}{phyloseq,`NULL`}(physeq, formula = NULL,
+ method = "CCA", ...)
+}
+\arguments{
+\item{physeq}{(Required). Phylogenetic sequencing data
+(\code{\link{phyloseq-class}}).
+The data on which you want to perform the ordination.}
+
+\item{formula}{(Optional). A \code{\link{formula}},
+specifying the constraining variable(s) format,
+with variable names corresponding to \code{\link{sample_data}} (RHS)
+from within \code{physeq}.}
+
+\item{method}{(Optional). A single \code{\link{character}} string,
+specifying \code{"RDA"} or \code{"CCA"}. Default is \code{"CCA"}.}
+
+\item{...}{(Optional). Additional named arguments passed to
+\code{\link[vegan]{cca}} or \code{\link[vegan]{rda}}.}
+}
+\value{
+same output as \code{\link[vegan]{cca}}
+ or \code{\link[vegan]{rda}}, respectively.
+}
+\description{
+This is the internal function that simplifies getting phyloseq data
+into the constrained ordination functions,
+\code{\link[vegan]{cca}} and \code{\link[vegan]{rda}}.
+Unlike \code{\link[phyloseq]{capscale.phyloseq}}, the formula argument
+to these methods is optional, and results in an unconstrained ordination.
+}
+\examples{
+#
+# cca.phyloseq(physeq, formula, method, ...)
+}
+\seealso{
+\code{\link{plot_ordination}},
+ \code{\link[vegan]{rda}}, \code{\link[vegan]{cca}}
+}
+\keyword{internal}
+
diff --git a/man/chunkReOrder.Rd b/man/chunkReOrder.Rd
new file mode 100644
index 0000000..be5e8d1
--- /dev/null
+++ b/man/chunkReOrder.Rd
@@ -0,0 +1,35 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/plot-methods.R
+\name{chunkReOrder}
+\alias{chunkReOrder}
+\title{Chunk re-order a vector so that specified newstart is first.}
+\usage{
+chunkReOrder(x, newstart = x[[1]])
+}
+\description{
+Different than relevel.
+}
+\examples{
+# Typical use-case
+# chunkReOrder(1:10, 5)
+# # Default is to not modify the vector
+# chunkReOrder(1:10)
+# # Another example not starting at 1
+# chunkReOrder(10:25, 22)
+# # Should silently ignore the second element of `newstart`
+# chunkReOrder(10:25, c(22, 11))
+# # Should be able to handle `newstart` being the first argument already
+# # without duplicating the first element at the end of `x`
+# chunkReOrder(10:25, 10)
+# all(chunkReOrder(10:25, 10) == 10:25)
+# # This is also the default
+# all(chunkReOrder(10:25) == 10:25)
+# # An example with characters
+# chunkReOrder(LETTERS, "G")
+# chunkReOrder(LETTERS, "B")
+# chunkReOrder(LETTERS, "Z")
+# # What about when `newstart` is not in `x`? Return x as-is, throw warning.
+# chunkReOrder(LETTERS, "g")
+}
+\keyword{internal}
+
diff --git a/man/data-GlobalPatterns.Rd b/man/data-GlobalPatterns.Rd
new file mode 100644
index 0000000..9219bf3
--- /dev/null
+++ b/man/data-GlobalPatterns.Rd
@@ -0,0 +1,50 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/allData.R
+\docType{data}
+\name{data-GlobalPatterns}
+\alias{GlobalPatterns}
+\alias{data-GlobalPatterns}
+\title{(Data) Global patterns of 16S rRNA diversity at a depth of millions of sequences per sample (2011)}
+\description{
+Published in PNAS in early 2011. This work compared the microbial
+communities from 25 environmental samples and three known ``mock communities''
+-- a total of 9 sample types -- at a depth averaging 3.1 million reads per sample.
+Authors were able to reproduce diversity patterns seen in many other
+published studies, while also investigating technical issues/bias by
+applying the same techniques to simulated microbial communities of known
+composition.
+}
+\details{
+abstract from research article (quoted):
+
+The ongoing revolution in high-throughput sequencing continues to democratize the ability of small groups of investigators to map the microbial component of the biosphere. In particular, the coevolution of new sequencing platforms and new software tools allows data acquisition and analysis on an unprecedented scale. Here we report the next stage in this coevolutionary arms race, using the Illumina GAIIx platform to sequence a diverse array of 25 environmental samples and three known ``mo [...]
+
+(end quote)
+
+Many thanks to J. Gregory Caporaso for directly providing the OTU-clustered data files
+for inclusion in this package.
+}
+\examples{
+data(GlobalPatterns)
+plot_richness(GlobalPatterns, x="SampleType", measures=c("Observed", "Chao1", "Shannon"))
+}
+\author{
+Caporaso, J. G., et al.
+}
+\references{
+Caporaso, J. G., et al. (2011).
+Global patterns of 16S rRNA diversity at a depth of millions of sequences per sample.
+PNAS, 108, 4516-4522.
+PMCID: PMC3063599
+
+The primary article can be viewed/downloaded at:
+\url{http://www.pnas.org/content/108/suppl.1/4516.short}
+}
+\seealso{
+The examples on the phyloseq wiki page for \code{\link{plot_ordination}} show
+ many more examples:
+
+\url{https://github.com/joey711/phyloseq/wiki/plot_ordination}
+}
+\keyword{data}
+
diff --git a/man/data-enterotype.Rd b/man/data-enterotype.Rd
new file mode 100644
index 0000000..c9fa9e6
--- /dev/null
+++ b/man/data-enterotype.Rd
@@ -0,0 +1,48 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/allData.R
+\docType{data}
+\name{data-enterotype}
+\alias{data-enterotype}
+\alias{enterotype}
+\title{(Data) Enterotypes of the human gut microbiome (2011)}
+\description{
+Published in Nature in early 2011, this work compared (among other things),
+the faecal microbial communities from 22
+subjects using complete shotgun DNA sequencing.
+Authors further compared these microbial communities with the faecal
+communities of subjects from other studies. A total of 280 faecal samples / subjects
+are represented in this dataset, and 553 genera. The authors claim that the
+data naturally clumps into three community-level clusters, or ``enterotypes'',
+that are not immediately explained by sequencing technology or demographic
+features of the subjects, but with potential relevance to understanding
+human gut microbiota.
+}
+\details{
+abstract from research article (quoted):
+
+Our knowledge of species and functional composition of the human gut microbiome is rapidly increasing, but it is still based on very few cohorts and little is known about variation across the world. By combining 22 newly sequenced faecal metagenomes of individuals from four countries with previously published data sets, here we identify three robust clusters (referred to as enterotypes hereafter) that are not nation or continent specific. We also confirmed the enterotypes in two publishe [...]
+
+(end quote)
+}
+\examples{
+data(enterotype)
+ig <- make_network(enterotype, "samples", max.dist=0.3)
+plot_network(ig, enterotype, color="SeqTech", shape="Enterotype", line_weight=0.3, label=NULL)
+}
+\author{
+Arumugam, M., Raes, J., et al.
+}
+\references{
+Arumugam, M., et al. (2011). Enterotypes of the human gut microbiome.
+
+Nature, 473(7346), 174-180.
+
+\url{http://www.nature.com/doifinder/10.1038/nature09944}
+See supplemental information for subject data.
+
+OTU-clustered data was downloaded from the publicly-accessible:
+
+\url{http://www.bork.embl.de/Docu/Arumugam_et_al_2011/downloads.html}
+}
+\keyword{data}
+
diff --git a/man/data-esophagus.Rd b/man/data-esophagus.Rd
new file mode 100644
index 0000000..39cca6b
--- /dev/null
+++ b/man/data-esophagus.Rd
@@ -0,0 +1,49 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/allData.R
+\docType{data}
+\name{data-esophagus}
+\alias{data-esophagus}
+\alias{esophagus}
+\title{(Data) Small example dataset from a human esophageal community (2004)}
+\description{
+Includes just 3 samples, 1 each from 3 subjects. Although the research article mentions 4 subjects,
+only 3 are included in this dataset.
+}
+\details{
+abstract from research article (quoted):
+
+The esophagus, like other luminal organs of the digestive system, provides a potential environment for bacterial colonization, but little is known about the presence of a bacterial biota or its nature. By using broad-range 16S rDNA PCR, biopsies were examined from the normal esophagus of four human adults. The 900 PCR products cloned represented 833 unique sequences belonging to 41 genera, or 95 species-level operational taxonomic units (SLOTU); 59 SLOTU were homologous with culture-defi [...]
+
+(end quote)
+
+A description of the 16S rRNA sequence processing can be found on the mothur-wiki
+at the link below. A cutoff of 0.10 was used for OTU clustering in that example,
+and it is taken here as well to create example data, \code{esophagus}, which was
+easily imported with the \code{import_mothur()} function.
+}
+\examples{
+data(esophagus)
+UniFrac(esophagus, weighted=TRUE)
+# How to re-create the esophagus dataset using import_mothur function
+mothlist <- system.file("extdata", "esophagus.fn.list.gz", package="phyloseq")
+mothgroup <- system.file("extdata", "esophagus.good.groups.gz", package="phyloseq")
+mothtree <- system.file("extdata", "esophagus.tree.gz", package="phyloseq")
+show_mothur_cutoffs(mothlist)
+cutoff <- "0.10"
+esophman <- import_mothur(mothlist, mothgroup, mothtree, cutoff)
+}
+\author{
+Pei et al. \email{zhiheng.pei at med.nyu.edu}
+}
+\references{
+Pei, Z., Bini, E. J., Yang, L., Zhou, M., Francois, F., & Blaser, M. J. (2004).
+Bacterial biota in the human distal esophagus.
+Proceedings of the National Academy of Sciences of the United States of America, 101(12), 4250-4255.
+\url{http://www.ncbi.nlm.nih.gov/pmc/articles/PMC384727}
+
+mothur-processed files and the sequence data can be downloaded from a zip-file,
+along with additional description, from the following URL:
+\url{http://www.mothur.org/wiki/Esophageal_community_analysis}
+}
+\keyword{data}
+
diff --git a/man/data-soilrep.Rd b/man/data-soilrep.Rd
new file mode 100644
index 0000000..150c695
--- /dev/null
+++ b/man/data-soilrep.Rd
@@ -0,0 +1,84 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/allData.R
+\docType{data}
+\name{data-soilrep}
+\alias{data-soilrep}
+\alias{soilrep}
+\title{(Data) Reproducibility of soil microbiome data (2011)}
+\description{
+Published in early 2011,
+this work compared 24 separate soil microbial communities under four treatment
+conditions via multiplexed/barcoded 454-pyrosequencing of PCR-amplified 16S rRNA gene fragments.
+The authors found differences in the composition and structure of microbial
+communities between soil treatments.
+As expected, the soil microbial communities were highly diverse, with a staggering
+16,825 different OTUs (species) observed in the included dataset.
+Interestingly, this study used a larger number of replicates than previous studies of this type,
+for a total of 56 samples, and the putatively low resampling rate of species
+between replicated sequencing trials (``OTU overlap'') was a major concern by
+the authors.
+}
+\details{
+This dataset contains an experiment-level (\code{\link{phyloseq-class}}) object,
+which in turn contains the taxa-contingency table and soil-treatment table
+as \code{\link{otu_table-class}} and \code{\link{sample_data-class}} components, respectively.
+
+This data was
+imported from raw files supplied directly by the authors via personal communication
+for the purposes of including as an example in the \code{\link{phyloseq-package}}.
+As this data is sensitive to choices in OTU-clustering parameters, attempts to recreate
+the \code{otu_table} from the raw sequencing data may give slightly different results
+than the table provided here.
+
+abstract from research article (quoted):
+
+To determine the reproducibility and quantitation of the amplicon sequencing-based
+detection approach for analyzing microbial community structure, a total of 24 microbial
+communities from a long-term global change experimental site were examined. Genomic DNA
+obtained from each community was used to amplify 16S rRNA genes with two or three
+barcode tags as technical replicates in the presence of a small quantity (0.1\% wt/wt)
+of genomic DNA from Shewanella oneidensis MR-1 as the control. The technical
+reproducibility of the amplicon sequencing-based detection approach is quite low,
+with an average operational taxonomic unit (OTU) overlap of 17.2\%\code{+/-}2.3\%
+between two technical replicates, and 8.2\%\code{+/-}2.3\% among three technical
+replicates, which is most likely due to problems associated with random sampling processes.
+Such variations in technical replicates could have substantial effects on estimating
+beta-diversity but less on alpha-diversity. A high variation was also observed in the
+control across different samples (for example, 66.7-fold for the forward primer),
+suggesting that the amplicon sequencing-based detection approach could not be quantitative.
+In addition, various strategies were examined to improve the comparability of amplicon
+sequencing data, such as increasing biological replicates, and removing singleton sequences
+and less-representative OTUs across biological replicates. Finally, as expected, various
+statistical analyses with preprocessed experimental data revealed clear differences in
+the composition and structure of microbial communities between warming and non-warming,
+or between clipping and non-clipping. Taken together, these results suggest that amplicon
+sequencing-based detection is useful in analyzing microbial community structure even
+though it is not reproducible and quantitative. However, great caution should be taken
+in experimental design and data interpretation when the amplicon sequencing-based detection
+approach is used for quantitative analysis of the beta-diversity of microbial communities.
+
+(end quote)
+}
+\examples{
+# Load the data
+data(soilrep)
+################################################################################
+# Alpha diversity (richness) example. Accept null hypothesis:
+# No convincing difference in species richness between warmed/unwarmed soils.
+################################################################################
+# Graphically compare richness between the different treatments.
+man.col <- c(WC="red", WU="brown", UC="blue", UU="darkgreen")
+plot_richness(soilrep, x="Treatment", color="Treatment", measures=c("Observed", "Chao1", "Shannon"))
+}
+\author{
+Jizhong Zhou, et al.
+}
+\references{
+Zhou, J., Wu, L., Deng, Y., Zhi, X., Jiang, Y.-H., Tu, Q., Xie, J., et al.
+ Reproducibility and quantitation of amplicon sequencing-based detection.
+ The ISME Journal. (2011) 5(8):1303-1313. \code{doi:10.1038/ismej.2011.11}
+
+The article can be accessed online at \url{http://www.nature.com/ismej/journal/v5/n8/full/ismej201111a.html}
+}
+\keyword{data}
+
diff --git a/man/decorana.Rd b/man/decorana.Rd
new file mode 100644
index 0000000..45fc5aa
--- /dev/null
+++ b/man/decorana.Rd
@@ -0,0 +1,27 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/allClasses.R
+\docType{data}
+\name{decorana}
+\alias{decorana}
+\title{S3 class placeholder definition (list) for decorana}
+\format{An object of class \code{decorana} of length 0.}
+\usage{
+decorana
+}
+\description{
+The vegan package does not export a version of its \code{\link[vegan]{decorana}}-class,
+partly because it is not really defined formally anywhere.
+Instead, it is an S3 class extended from the base class, \code{\link{list}} --
+this is a very common and easy approach --
+and proper behavior of any method taking an instance of this class
+requires exact naming conventions for element names of the list components.
+The phyloseq package does not provide any validity checks that a given decorana
+instance is valid (conforms to the conventions in the vegan package)... yet.
+If problems arise, this might be considered, and they could be defined
+judiciously and within phyloseq.
+}
+\seealso{
+\code{\link[vegan]{decorana}}
+}
+\keyword{internal}
+
diff --git a/man/dist-class.Rd b/man/dist-class.Rd
new file mode 100644
index 0000000..68745b2
--- /dev/null
+++ b/man/dist-class.Rd
@@ -0,0 +1,13 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/allClasses.R
+\name{dist-class}
+\alias{dist-class}
+\title{An S4 placeholder for the \code{\link[stats]{dist}} class.}
+\description{
+See \code{\link[stats]{dist}} for details
+about this type of a distance matrix object.
+}
+\seealso{
+\code{\link[stats]{dist}}, \code{\link{setOldClass}}
+}
+
diff --git a/man/distance.Rd b/man/distance.Rd
new file mode 100644
index 0000000..7d1cb00
--- /dev/null
+++ b/man/distance.Rd
@@ -0,0 +1,110 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/distance-methods.R
+\docType{methods}
+\name{distance}
+\alias{distance}
+\alias{distance,otu_table,character-method}
+\alias{distance,phyloseq,ANY-method}
+\alias{distance,phyloseq,character-method}
+\title{Calculate distance, dissimilarity}
+\usage{
+distance(physeq, method, type = "samples", ...)
+
+\S4method{distance}{phyloseq,ANY}(physeq, method)
+
+\S4method{distance}{otu_table,character}(physeq, method, type = "samples",
+ ...)
+
+\S4method{distance}{phyloseq,character}(physeq, method, type = "samples", ...)
+}
+\arguments{
+\item{physeq}{(Required). A \code{\link{phyloseq-class}} or
+an \code{\link{otu_table-class}} object. The latter is only appropriate
+for methods that do not require any additional data (one-table).
+For example, the ``wunifrac'' option (\code{\link{UniFrac}}) requires
+\code{\link{phyloseq-class}} that contains both an \code{otu_table}
+and a phylogenetic tree (\code{phylo}).}
+
+\item{method}{(Required). A character string.
+ Provide one of the currently supported options.
+ See \code{\link{distanceMethodList}} for a detailed list
+ of the supported options here,
+ and links to accompanying documentation.
+
+ Note that for the common definition of \code{Jaccard} distance
+ using the \code{vegan-package} implementation,
+ an additional argument is needed, with the full call having the form:
+ \code{distance(physeq, method = "jaccard", binary = TRUE)}
+
+ The following methods are implemented explicitly within
+ the \code{\link{phyloseq-package}},
+ and accessed by the following \code{method} options:
+
+ \describe{
+ \item{\code{"unifrac"}}{Original (unweighted) UniFrac distance,
+ \code{\link[phyloseq]{UniFrac}}}
+ \item{\code{"wunifrac"}}{weighted-UniFrac distance, \code{\link[phyloseq]{UniFrac}}}
+ \item{\code{"dpcoa"}}{
+ sample-wise distance used in
+        Double Principal Coordinate Analysis, \code{\link[phyloseq]{DPCoA}}}
+ \item{\code{"jsd"}}{Jensen-Shannon Divergence, \code{\link{JSD}}}
+ }
+
+ Alternatively, you can provide
+ a character string that defines a custom distance method, if it has the form
+ described in \code{\link{designdist}}.}
+
+\item{type}{(Optional). A character string. The type of pairwise comparisons
+being calculated: sample-wise or taxa-wise. The default is
+\code{c("samples")}.}
+
+\item{...}{Additional arguments passed on to the appropriate distance
+function, determined by the \code{method} argument.}
+}
+\value{
+An object of class ``\code{\link{dist}}'' suitable for certain
+ ordination methods and other distance-based analyses.
+}
+\description{
+Takes a \code{\link{phyloseq-class}} object and method option, and returns
+ a \code{\link{dist}}ance object suitable for certain
+ ordination methods and other distance-based analyses.
+ Only
+ sample-wise distances are currently supported (the \code{type} argument),
+ but eventually species-wise (OTU-wise)
+ distances may be supported as well.
+}
+\details{
+Depending on the \code{method}
+ argument, \code{distance()} wraps one of
+ \code{\link{UniFrac}},
+ \code{\link{DPCoA}},
+ \code{\link{JSD}},
+ \code{\link[vegan]{vegdist}},
+ \code{\link[vegan]{betadiver}},
+ \code{\link[vegan]{designdist}}, or
+ \code{\link{dist}}.
+}
+\examples{
+data(esophagus)
+distance(esophagus, "uunifrac") # Unweighted UniFrac
+distance(esophagus, "wunifrac") # weighted UniFrac
+distance(esophagus, "jaccard", binary = TRUE) # vegdist jaccard
+distance(esophagus, "gower") # vegdist option "gower"
+distance(esophagus, "g") # designdist method option "g"
+distance(esophagus, "minkowski") # invokes a method from the base dist() function.
+distance(esophagus, "(A+B-2*J)/(A+B)") # designdist custom distance
+distanceMethodList
+help("distance")
+}
+\seealso{
+\code{\link{plot_ordination}},
+ \code{\link{UniFrac}},
+ \code{\link{DPCoA}},
+ \code{\link{JSD}},
+ \code{\link[vegan]{vegdist}},
+ \code{\link[vegan]{betadiver}},
+ \code{\link[vegan]{designdist}},
+ \code{\link{dist}}.
+}
+
diff --git a/man/distanceMethodList.Rd b/man/distanceMethodList.Rd
new file mode 100644
index 0000000..2e154a6
--- /dev/null
+++ b/man/distanceMethodList.Rd
@@ -0,0 +1,76 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/distance-methods.R
+\docType{data}
+\name{distanceMethodList}
+\alias{distanceMethodList}
+\title{List of distance method keys supported in \code{\link[phyloseq]{distance}}}
+\format{A list of character vectors.
+Every entry specifies a supported distance method.
+Names in the list indicate which downstream function
+is being utilized for further details.
+Same functions are linked in the itemized list below.
+
+\describe{
+ \item{\code{unifrac}}{\code{\link[phyloseq]{UniFrac}}}
+ \item{\code{wunifrac}}{\code{\link[phyloseq]{UniFrac}}}
+ \item{\code{dpcoa}}{\code{\link[phyloseq]{DPCoA}}}
+ \item{\code{jsd}}{\code{\link{JSD}}}
+ \item{\code{manhattan}}{\code{\link[vegan]{vegdist}}}
+ \item{\code{euclidean}}{\code{\link[vegan]{vegdist}}}
+ \item{\code{canberra}}{\code{\link[vegan]{vegdist}}}
+ \item{\code{bray}}{\code{\link[vegan]{vegdist}}}
+ \item{\code{kulczynski}}{\code{\link[vegan]{vegdist}}}
+ \item{\code{jaccard}}{\code{\link[vegan]{vegdist}}}
+ \item{\code{gower}}{\code{\link[vegan]{vegdist}}}
+ \item{\code{altGower}}{\code{\link[vegan]{vegdist}}}
+ \item{\code{morisita}}{\code{\link[vegan]{vegdist}}}
+ \item{\code{horn}}{\code{\link[vegan]{vegdist}}}
+ \item{\code{mountford}}{\code{\link[vegan]{vegdist}}}
+ \item{\code{raup}}{\code{\link[vegan]{vegdist}}}
+ \item{\code{binomial}}{\code{\link[vegan]{vegdist}}}
+ \item{\code{chao}}{\code{\link[vegan]{vegdist}}}
+ \item{\code{cao}}{\code{\link[vegan]{vegdist}}}
+ \item{\code{w}}{\code{\link[vegan]{betadiver}}}
+ \item{\code{-}}{\code{\link[vegan]{betadiver}}}
+ \item{\code{c}}{\code{\link[vegan]{betadiver}}}
+ \item{\code{wb}}{\code{\link[vegan]{betadiver}}}
+ \item{\code{r}}{\code{\link[vegan]{betadiver}}}
+ \item{\code{I}}{\code{\link[vegan]{betadiver}}}
+ \item{\code{e}}{\code{\link[vegan]{betadiver}}}
+ \item{\code{t}}{\code{\link[vegan]{betadiver}}}
+ \item{\code{me}}{\code{\link[vegan]{betadiver}}}
+ \item{\code{j}}{\code{\link[vegan]{betadiver}}}
+ \item{\code{sor}}{\code{\link[vegan]{betadiver}}}
+ \item{\code{m}}{\code{\link[vegan]{betadiver}}}
+ \item{\code{-}}{\code{\link[vegan]{betadiver}}}
+ \item{\code{co}}{\code{\link[vegan]{betadiver}}}
+ \item{\code{cc}}{\code{\link[vegan]{betadiver}}}
+ \item{\code{g}}{\code{\link[vegan]{betadiver}}}
+ \item{\code{-}}{\code{\link[vegan]{betadiver}}}
+ \item{\code{l}}{\code{\link[vegan]{betadiver}}}
+ \item{\code{hk}}{\code{\link[vegan]{betadiver}}}
+ \item{\code{rlb}}{\code{\link[vegan]{betadiver}}}
+ \item{\code{sim}}{\code{\link[vegan]{betadiver}}}
+ \item{\code{gl}}{\code{\link[vegan]{betadiver}}}
+ \item{\code{z}}{\code{\link[vegan]{betadiver}}}
+ \item{\code{maximum}}{\code{\link[stats]{dist}}}
+ \item{\code{binary}}{\code{\link[stats]{dist}}}
+ \item{\code{minkowski}}{\code{\link[stats]{dist}}}
+ \item{\code{ANY}}{\code{\link[vegan]{designdist}}}
+}}
+\usage{
+distanceMethodList
+}
+\description{
+Distance methods should be specified by exact string match.
+Partial matching is not supported for all options,
+because there are too many similar options in downstream method dispatch.
+}
+\examples{
+distanceMethodList
+}
+\seealso{
+\code{\link[phyloseq]{distance}}
+}
+\keyword{datasets}
+
diff --git a/man/envHash2otu_table.Rd b/man/envHash2otu_table.Rd
new file mode 100644
index 0000000..5ce4cf8
--- /dev/null
+++ b/man/envHash2otu_table.Rd
@@ -0,0 +1,47 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/IO-methods.R
+\name{envHash2otu_table}
+\alias{envHash2otu_table}
+\title{Convert a sequence-sample hash (like ENV file) into an OTU table.}
+\usage{
+envHash2otu_table(tipSampleTable)
+}
+\arguments{
+\item{tipSampleTable}{(Required). A two-column character table (matrix or data.frame),
+where each row specifies the sequence name and source sample, consistent with the
+env-file for the UniFrac server (\url{http://bmf2.colorado.edu/unifrac/}).}
+}
+\value{
+\code{\link{otu_table}}. A trivial OTU table where each sequence
+ is treated as a separate OTU.
+}
+\description{
+Parses an ENV-file into a sparse matrix of species-by-sample, where
+each species-row has only one non-zero value. We call this sparse abundance
+table the trivial OTU table, where every sequence is treated as a separate
+species. If a phylogenetic tree is available, it can be submitted with this
+table as arguments to \code{\link{tip_glom}} to create an object with a
+non-trivial \code{otu_table}.
+}
+\examples{
+#
+## fakeSeqNameVec <- paste("seq_", 1:8, sep="")
+## fakeSamNameVec <- c(rep("A", 4), rep("B", 4))
+## fakeSeqAbunVec <- sample(1:50, 8, TRUE)
+## test <- cbind(fakeSeqNameVec, fakeSamNameVec, fakeSeqAbunVec)
+## testotu <- envHash2otu_table( test )
+## test <- cbind(fakeSeqNameVec, fakeSamNameVec)
+## testotu <- envHash2otu_table( test )
+}
+\references{
+\url{http://bmf2.colorado.edu/unifrac/}
+}
+\seealso{
+\code{\link{import_env_file}}
+
+\code{\link{tip_glom}}
+
+\code{\link{otu_table}}
+}
+\keyword{internal}
+
diff --git a/man/estimate_richness.Rd b/man/estimate_richness.Rd
new file mode 100644
index 0000000..3c2d044
--- /dev/null
+++ b/man/estimate_richness.Rd
@@ -0,0 +1,64 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/extend_vegan.R
+\name{estimate_richness}
+\alias{estimate_richness}
+\title{Summarize alpha diversity}
+\usage{
+estimate_richness(physeq, split = TRUE, measures = NULL)
+}
+\arguments{
+\item{physeq}{(Required). \code{\link{phyloseq-class}}, or alternatively,
+an \code{\link{otu_table-class}}. The data about which you want to estimate
+the richness.}
+
+\item{split}{(Optional). Logical. Should a separate set of richness estimates
+be performed for each sample? Or alternatively, pool all samples and
+estimate richness of the entire set.}
+
+\item{measures}{(Optional). Default is \code{NULL}, meaning that
+all available alpha-diversity measures will be included.
+Alternatively, you can specify one or more measures
+as a character vector of measure names.
+Values must be among those supported:
+\code{c("Observed", "Chao1", "ACE", "Shannon", "Simpson", "InvSimpson", "Fisher")}.}
+}
+\value{
+A \code{data.frame} of the richness estimates, and their standard error.
+}
+\description{
+Performs a number of standard alpha diversity estimates,
+and returns the results as a \code{data.frame}.
+Strictly speaking, this function is not only estimating richness,
+despite its name.
+It can operate on the cumulative population of all
+samples in the dataset, or by repeating the richness estimates for each
+sample individually.
+NOTE: You must use untrimmed datasets
+for meaningful results, as these estimates (and even the ``observed'' richness)
+are highly dependent on the number of singletons. You can always trim the data
+later on if needed, just not before using this function.
+}
+\examples{
+## There are many more interesting examples at the phyloseq online tutorials.
+## http://joey711.github.com/phyloseq/plot_richness-examples
+ data("esophagus")
+ # Default is all available measures
+ estimate_richness(esophagus)
+ # Specify just one:
+ estimate_richness(esophagus, measures="Observed")
+ # Specify a few:
+ estimate_richness(esophagus, measures=c("Observed", "InvSimpson", "Shannon", "Chao1"))
+}
+\seealso{
+Check out the custom plotting function, \code{\link{plot_richness}},
+ for easily showing the results of different estimates,
+ with method-specific error-bars.
+ Also check out the internal functions borrowed from the \code{vegan} package:
+
+ \code{\link[vegan]{estimateR}}
+
+ \code{\link[vegan]{diversity}}
+
+ \code{\link[vegan]{fisherfit}}
+}
+
diff --git a/man/export_env_file.Rd b/man/export_env_file.Rd
new file mode 100644
index 0000000..bbc40a6
--- /dev/null
+++ b/man/export_env_file.Rd
@@ -0,0 +1,36 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/IO-methods.R
+\name{export_env_file}
+\alias{export_env_file}
+\title{Export environment (ENV) file for UniFrac Server.}
+\usage{
+export_env_file(physeq, file = "", writeTree = TRUE, return = FALSE)
+}
+\arguments{
+\item{physeq}{(Required). Experiment-level (\code{\link{phyloseq-class}}) object.
+Ideally this also contains the phylogenetic tree, which is also exported by default.}
+
+\item{file}{(Optional). The file path for export. If not-provided, the
+expectation is that you will want to set \code{return} to \code{TRUE},
+and manipulate the ENV table on your own. Default is \code{""}, skipping
+the ENV file from being written to a file.}
+
+\item{writeTree}{(Optional). Write the phylogenetic tree as well as the
+ENV table. Default is \code{TRUE}.}
+
+\item{return}{(Optional). Should the ENV table be returned to the R workspace?
+Default is \code{FALSE}.}
+}
+\description{
+Creates the environment table that is needed for the original UniFrac
+algorithm. Useful for cross-checking, or if you want to use the UniFrac server.
+Optionally the ENV-formatted table can be returned to the \code{R}
+workspace, and the tree component can be exported as Nexus format
+(Recommended).
+}
+\examples{
+# # Load example data
+# data(esophagus)
+# export_env_file(esophagus, "~/Desktop/esophagus.txt")
+}
+
diff --git a/man/export_mothur_dist.Rd b/man/export_mothur_dist.Rd
new file mode 100644
index 0000000..12c9fcc
--- /dev/null
+++ b/man/export_mothur_dist.Rd
@@ -0,0 +1,40 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/IO-methods.R
+\name{export_mothur_dist}
+\alias{export_mothur_dist}
+\title{Export a distance object as \code{.names} and \code{.dist} files for mothur}
+\usage{
+export_mothur_dist(x, out=NULL, makeTrivialNamesFile=NULL)
+}
+\arguments{
+\item{x}{(Required). A \code{"dist"} object, or a symmetric matrix.}
+
+\item{out}{(Optional). The desired output filename for the \code{.dist} file, OR
+left \code{NULL}, the default, in which case the mothur-formatted distance table
+is returned to \code{R} standard out.}
+
+\item{makeTrivialNamesFile}{(Optional). Default \code{NULL}. The desired name of the \code{.names} file.
+If left \code{NULL}, the file name will be a modified version of the \code{out} argument.}
+}
+\value{
+A character vector of the different cutoff values contained in the file.
+ For a given set of arguments to the \code{cluster()} command from within
+ \emph{mothur}, a number of OTU-clustering results are returned in the same
+ list file. The exact cutoff values used by \emph{mothur} can vary depending
+ on the input data. This simple function returns the cutoffs that were actually
+ included in the \emph{mothur} output. This is an important extra step prior to
+ importing the OTUs with the \code{import_mothur_otulist()} function.
+}
+\description{
+The purpose of this function is to allow a user to easily export a distance object
+as a pair of files that can be immediately imported by mothur for OTU clustering
+and related analysis. A distance object can be created in \code{R} in a number of
+ways, including via cataloguing the cophenetic distances of a tree object.
+}
+\examples{
+#
+data(esophagus)
+myDistObject <- as.dist(ape::cophenetic.phylo(phy_tree(esophagus)))
+export_mothur_dist(myDistObject)
+}
+
diff --git a/man/extract-methods.Rd b/man/extract-methods.Rd
new file mode 100644
index 0000000..bcb458e
--- /dev/null
+++ b/man/extract-methods.Rd
@@ -0,0 +1,80 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/extract-methods.R
+\docType{methods}
+\name{[,otu_table,ANY,ANY,ANY-method}
+\alias{[,XStringSet,character,ANY,ANY-method}
+\alias{[,otu_table,ANY,ANY,ANY-method}
+\alias{[,sample_data,ANY,ANY,ANY-method}
+\alias{[,taxonomyTable,ANY,ANY,ANY-method}
+\title{Method extensions to extraction operator for phyloseq objects.}
+\usage{
+\S4method{[}{otu_table,ANY,ANY,ANY}(x, i, j, ..., drop = TRUE)
+
+\S4method{[}{sample_data,ANY,ANY,ANY}(x, i, j, ..., drop = TRUE)
+
+\S4method{[}{taxonomyTable,ANY,ANY,ANY}(x, i, j, ..., drop = TRUE)
+
+\S4method{[}{XStringSet,character,ANY,ANY}(x, i)
+}
+\arguments{
+\item{x}{
+ object from which to extract element(s) or in which to replace element(s).
+ }
+
+\item{i}{
+ indices specifying elements to extract or replace. Indices are
+ \code{numeric} or \code{character} vectors or empty (missing) or
+ \code{NULL}. Numeric values are coerced to integer as by
+ \code{\link{as.integer}} (and hence truncated towards zero).
+ Character vectors will be matched to the \code{\link{names}} of the
+ object (or for matrices/arrays, the \code{\link{dimnames}}):
+ see \sQuote{Character indices} below for further details.
+
+ For \code{[}-indexing only: \code{i}, \code{j}, \code{\dots} can be
+ logical vectors, indicating elements/slices to select. Such vectors
+ are recycled if necessary to match the corresponding extent.
+ \code{i}, \code{j}, \code{\dots} can also be negative integers,
+ indicating elements/slices to leave out of the selection.
+
+ When indexing arrays by \code{[} a single argument \code{i} can be a
+ matrix with as many columns as there are dimensions of \code{x}; the
+ result is then a vector with elements corresponding to the sets of
+ indices in each row of \code{i}.
+
+ An index value of \code{NULL} is treated as if it were \code{integer(0)}.
+ }
+
+\item{j}{See \code{\link[base]{Extract}}}
+
+\item{...}{See \code{\link[base]{Extract}}}
+
+\item{drop}{For matrices and arrays. If \code{TRUE} the result is
+ coerced to the lowest possible dimension (see the examples). This
+ only works for extracting elements, not for the replacement. See
+ \code{\link{drop}} for further details.
+ }
+}
+\description{
+See the documentation for the \code{\link[base]{Extract}} generic,
+defined in the R \code{\link[base]{base-package}}
+for the expected behavior.
+}
+\details{
+One special exception to standard behavior of these methods in phyloseq is that
+the \code{drop} argument is set internally to \code{FALSE}.
+This helps avoid bugs during complicated subsetting with multiple components,
+where it is necessary to be able to use a two dimensional indexing even
+if one of those dimensions has only 1 rank.
+Put another way, these phyloseq-defined extractions never collapse their result
+into a vector. See the documentation of \code{\link[base]{Extract}} for
+more information about the \code{drop} argument.
+}
+\examples{
+data(esophagus)
+nrow(otu_table(esophagus))
+nrow(otu_table(esophagus)[1:5, ])
+}
+\seealso{
+\code{\link[base]{Extract}}
+}
+
diff --git a/man/filter_taxa.Rd b/man/filter_taxa.Rd
new file mode 100644
index 0000000..84ed4c6
--- /dev/null
+++ b/man/filter_taxa.Rd
@@ -0,0 +1,55 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/transform_filter-methods.R
+\name{filter_taxa}
+\alias{filter_taxa}
+\title{Filter taxa based on across-sample OTU abundance criteria}
+\usage{
+filter_taxa(physeq, flist, prune=FALSE)
+}
+\arguments{
+\item{physeq}{(Required). A \code{\link{phyloseq-class}} object that you
+want to trim/filter.}
+
+\item{flist}{(Required). A function or list of functions that take a vector
+of abundance values and return a logical. Some canned useful function types
+are included in the \code{genefilter}-package.}
+
+\item{prune}{(Optional). A logical. Default \code{FALSE}. If \code{TRUE}, then
+the function returns the pruned \code{\link{phyloseq-class}} object, rather
+than the logical vector of taxa that passed the filter.}
+}
+\value{
+A logical vector with length equal to the number of taxa in \code{physeq}.
+ This can be provided directly to \code{\link{prune_taxa}} as first argument.
+ Alternatively, if \code{prune==TRUE}, the pruned \code{\link{phyloseq-class}}
+ object is returned instead.
+}
+\description{
+This function is directly analogous to the
+\code{\link[genefilter]{genefilter}} function for microarray filtering,
+but is used for filtering OTUs from phyloseq objects.
+It applies an arbitrary set of functions ---
+as a function list, for instance, created by \code{\link[genefilter]{filterfun}} ---
+as across-sample criteria, one OTU at a time.
+It takes as input a phyloseq object,
+and returns a logical vector
+indicating whether or not each OTU passed the criteria.
+Alternatively, if the \code{"prune"} option is set to \code{TRUE},
+it returns the already-trimmed version of the phyloseq object.
+}
+\examples{
+ data("enterotype")
+ require("genefilter")
+ flist <- filterfun(kOverA(5, 2e-05))
+ ent.logi <- filter_taxa(enterotype, flist)
+ ent.trim <- filter_taxa(enterotype, flist, TRUE)
+ identical(ent.trim, prune_taxa(ent.logi, enterotype))
+ identical(sum(ent.logi), ntaxa(ent.trim))
+ filter_taxa(enterotype, flist, TRUE)
+}
+\seealso{
+\code{\link[genefilter]{filterfun}},
+\code{\link{genefilter_sample}},
+\code{\link{filterfun_sample}}
+}
+
diff --git a/man/filterfun_sample.Rd b/man/filterfun_sample.Rd
new file mode 100644
index 0000000..d5bbfd8
--- /dev/null
+++ b/man/filterfun_sample.Rd
@@ -0,0 +1,37 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/transform_filter-methods.R
+\name{filterfun_sample}
+\alias{filterfun_sample}
+\title{A sample-wise filter function builder
+analogous to \code{\link[genefilter]{filterfun}}.}
+\usage{
+filterfun_sample(...)
+}
+\arguments{
+\item{...}{A comma-separated list of functions.}
+}
+\value{
+An enclosure (function) that itself will return a logical vector,
+ according to the
+ functions provided in the argument list, evaluated in order. The output of
+ filterfun_sample is appropriate for the `flist' argument to the
+ genefilter_sample method.
+}
+\description{
+See the \code{\link[genefilter]{filterfun}}, from the Bioconductor repository,
+for a taxa-/gene-wise filter (and further examples).
+}
+\examples{
+# Use simulated abundance matrix
+set.seed(711)
+testOTU <- otu_table(matrix(sample(1:50, 25, replace=TRUE), 5, 5), taxa_are_rows=FALSE)
+f1 <- filterfun_sample(topk(2))
+wh1 <- genefilter_sample(testOTU, f1, A=2)
+wh2 <- c(TRUE, TRUE, TRUE, FALSE, FALSE)
+prune_taxa(wh1, testOTU)
+prune_taxa(wh2, testOTU)
+}
+\seealso{
+\code{\link[genefilter]{filterfun}}, \code{\link{genefilter_sample}}
+}
+
diff --git a/man/fix_phylo.Rd b/man/fix_phylo.Rd
new file mode 100644
index 0000000..a190889
--- /dev/null
+++ b/man/fix_phylo.Rd
@@ -0,0 +1,18 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/phylo-class.R
+\docType{methods}
+\name{fix_phylo}
+\alias{fix_phylo}
+\alias{fix_phylo,phylo-method}
+\title{Method for fixing problems with phylo-class trees in phyloseq}
+\usage{
+fix_phylo(tree)
+
+\S4method{fix_phylo}{phylo}(tree)
+}
+\description{
+For now this only entails replacing each missing (\code{NA}) branch-length
+value with 0.0.
+}
+\keyword{internal}
+
diff --git a/man/gapstat_ord.Rd b/man/gapstat_ord.Rd
new file mode 100644
index 0000000..cd37e30
--- /dev/null
+++ b/man/gapstat_ord.Rd
@@ -0,0 +1,70 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/ordination-methods.R
+\name{gapstat_ord}
+\alias{gapstat_ord}
+\title{Estimate the gap statistic on an ordination result}
+\usage{
+gapstat_ord(ord, axes = c(1:2), type = "sites", FUNcluster = function(x,
+ k) { list(cluster = pam(x, k, cluster.only = TRUE)) }, K.max = 8, ...)
+}
+\arguments{
+\item{ord}{(Required). An ordination object. The precise class can vary.
+Any ordination classes supported internally by the phyloseq package
+should work, ultimately by passing to the \code{\link[vegan]{scores}} function
+or its internal extensions in phyloseq.}
+
+\item{axes}{(Optional). The ordination axes that you want to include.}
+
+\item{type}{(Optional). One of \code{"sites"}
+(the vegan package label for samples) or
+\code{"species"} (the vegan package label for OTUs/taxa).
+Default is \code{"sites"}.}
+
+\item{FUNcluster}{(Optional). This is passed to \code{\link[cluster]{clusGap}}.
+The documentation is copied here for convenience:
+a function which accepts as first argument a (data) matrix like \code{x},
+second argument, say (the number of desired clusters) \code{k}, where \code{k >= 2},
+and returns a list with a component named (or shortened to) cluster
+which is a vector of length \code{n = nrow(x)} of integers in \code{1:k}
+determining the clustering or grouping of the \code{n} observations.
+The default value is the following function, which wraps
+partitioning around medoids, \code{\link[cluster]{pam}}:
+
+\code{function(x, k){list(cluster = pam(x, k, cluster.only=TRUE))}}
+
+Any function that has these input/output properties (performing a clustering)
+will suffice. The more appropriate the clustering method, the better chance
+your gap statistic results will be useful.}
+
+\item{K.max}{(Optional). A single positive integer value.
+It indicates the maximum number of clusters that will be considered.
+Value must be at least two.
+This is passed to \code{\link[cluster]{clusGap}}.}
+
+\item{...}{(Optional). Additional named parameters
+passed on to \code{\link[cluster]{clusGap}}.
+For example, the \code{method} argument provides for extensive options
+regarding the method by which the ``optimal'' number of clusters
+is computed from the gap statistics (and their standard deviations).
+See the \code{\link[cluster]{clusGap}} documentation for more details.}
+}
+\value{
+An object of S3 class \code{"clusGap"}, basically a list with components.
+See the \code{\link[cluster]{clusGap}} documentation for more details.
+}
+\description{
+This is a wrapper for the \code{\link[cluster]{clusGap}} function,
+expecting an ordination result as the main data argument.
+}
+\examples{
+data("soilrep")
+sord = ordinate(soilrep, "PCoA", "bray")
+# Evaluate axes with scree plot
+plot_scree(sord)
+# Gap Statistic
+gs = gapstat_ord(sord, axes=1:3, verbose=FALSE)
+# plot_ordination(soilrep, sord, color="Treatment")
+plot_clusgap(gs)
+print(gs, method="Tibs2001SEmax")
+}
+
diff --git a/man/genefilter_sample-methods.Rd b/man/genefilter_sample-methods.Rd
new file mode 100644
index 0000000..7627e11
--- /dev/null
+++ b/man/genefilter_sample-methods.Rd
@@ -0,0 +1,73 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/transform_filter-methods.R
+\docType{methods}
+\name{genefilter_sample}
+\alias{genefilter_sample}
+\alias{genefilter_sample,matrix-method}
+\alias{genefilter_sample,otu_table-method}
+\alias{genefilter_sample,phyloseq-method}
+\title{Filter OTUs with arbitrary function, sample-wise.}
+\usage{
+genefilter_sample(X, flist, A=1)
+
+\S4method{genefilter_sample}{matrix}(X, flist, A = 1)
+
+\S4method{genefilter_sample}{otu_table}(X, flist, A = 1)
+
+\S4method{genefilter_sample}{phyloseq}(X, flist, A = 1)
+}
+\arguments{
+\item{X}{The object that needs trimming. Can be matrix, otu_table, or higher-
+order phyloseq classes that contain an otu_table.}
+
+\item{flist}{An enclosure object, typically created with \code{\link{filterfun_sample}}}
+
+\item{A}{An integer. The number of samples in which a taxon / OTU must pass the filter
+for it to be labeled TRUE in the output logical vector.}
+}
+\value{
+A logical vector with names equal to taxa_names (or rownames, if matrix).
+}
+\description{
+A general OTU trimming function for selecting OTUs that satisfy
+some criteria within the distribution of each sample, and then
+also an additional criteria for number of samples that must pass.
+This is a genefilter-like function that only considers sample-wise
+criteria. The number of acceptable samples is used
+as the final criteria (set by the argument \code{A})
+to determine whether or not the taxa should
+be retained (\code{TRUE}) or not (\code{FALSE}). Just like with genefilter, a
+logical having length equal to nrow()/\code{\link{ntaxa}} is returned, indicating which
+should be kept. This output can be provided
+directly to OTU trimming function, \code{\link{prune_taxa}}.
+By contrast, \code{\link[genefilter]{genefilter}},
+of the genefilter package in Bioconductor,
+works only on the rows of a matrix. Note that, because \code{\link{otu_table-class}}
+inherits directly from the \code{\link{matrix-class}}, an unmodified
+otu_table can be provided to \code{genefilter}, but be mindful of the orientation
+of the otu_table (use \code{\link{taxa_are_rows}}),
+and transpose (\code{\link[phyloseq]{t}}) if needed.
+}
+\examples{
+#
+## testOTU <- otu_table(matrix(sample(1:50, 25, replace=TRUE), 5, 5), taxa_are_rows=FALSE)
+## f1 <- filterfun_sample(topk(2))
+## wh1 <- genefilter_sample(testOTU, f1, A=2)
+## wh2 <- c(TRUE, TRUE, TRUE, FALSE, FALSE)
+## prune_taxa(wh1, testOTU)
+## prune_taxa(wh2, testOTU)
+##
+## tax_table1 <- tax_table(matrix("abc", 5, 5))
+## prune_taxa(wh1, tax_table1)
+## prune_taxa(wh2, tax_table1)
+}
+\seealso{
+\code{\link[genefilter]{genefilter}}, \code{\link{filterfun_sample}},
+ \code{\link[phyloseq]{t}},
+ \code{\link{prune_taxa}}
+}
+\keyword{OTU}
+\keyword{agglomerate}
+\keyword{cluster}
+\keyword{tree}
+
diff --git a/man/get.component.classes.Rd b/man/get.component.classes.Rd
new file mode 100644
index 0000000..0d2216c
--- /dev/null
+++ b/man/get.component.classes.Rd
@@ -0,0 +1,22 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/phyloseq-class.R
+\name{get.component.classes}
+\alias{get.component.classes}
+\title{Show the component objects classes and slot names.}
+\usage{
+get.component.classes()
+}
+\value{
+a character vector of the component objects classes, where each
+element is named by the corresponding slot name in the phyloseq-class.
+}
+\description{
+There are no arguments to this function. It returns a named character
+when called, which can then be used for tests of component data types, etc.
+}
+\examples{
+#
+#get.component.classes()
+}
+\keyword{internal}
+
diff --git a/man/get_sample-methods.Rd b/man/get_sample-methods.Rd
new file mode 100644
index 0000000..4fd08de
--- /dev/null
+++ b/man/get_sample-methods.Rd
@@ -0,0 +1,40 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/almostAllAccessors.R
+\docType{methods}
+\name{get_sample}
+\alias{get_sample}
+\alias{get_sample,otu_table-method}
+\alias{get_sample,phyloseq-method}
+\title{Returns all abundance values for species \code{i}.}
+\usage{
+get_sample(physeq, i)
+
+\S4method{get_sample}{otu_table}(physeq, i)
+
+\S4method{get_sample}{phyloseq}(physeq, i)
+}
+\arguments{
+\item{physeq}{(Required). \code{\link{otu_table-class}}, or \code{\link{phyloseq-class}}.}
+
+\item{i}{(Required). A single taxa/species/OTU ID for which you want
+to know the abundance in each sample.}
+}
+\value{
+An integer vector of the abundance values for
+each sample in \code{physeq} for species \code{i}
+}
+\description{
+This is a simple accessor function for investigating
+a single species-of-interest.
+}
+\examples{
+data(esophagus)
+taxa_names(esophagus)
+get_sample(esophagus, "59_5_19")
+}
+\seealso{
+\code{\link{get_taxa}}
+ \code{\link{taxa_names}}
+ \code{\link{sample_names}}
+}
+
diff --git a/man/get_taxa-methods.Rd b/man/get_taxa-methods.Rd
new file mode 100644
index 0000000..40ce618
--- /dev/null
+++ b/man/get_taxa-methods.Rd
@@ -0,0 +1,41 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/almostAllAccessors.R
+\docType{methods}
+\name{get_taxa}
+\alias{get_taxa}
+\alias{get_taxa,otu_table-method}
+\alias{get_taxa,phyloseq-method}
+\title{Returns all abundance values of sample \code{i}.}
+\usage{
+get_taxa(physeq, i)
+
+\S4method{get_taxa}{otu_table}(physeq, i)
+
+\S4method{get_taxa}{phyloseq}(physeq, i)
+}
+\arguments{
+\item{physeq}{(Required). \code{\link{otu_table-class}}, or \code{\link{phyloseq-class}}.}
+
+\item{i}{(Required). A single sample for which you want
+to know the abundance of each species. Can be integer
+for index value, or sample name.}
+}
+\value{
+An integer vector of the abundance values for
+each species in \code{physeq} for sample \code{i}
+}
+\description{
+This is a simple accessor function for investigating
+a single sample-of-interest.
+}
+\examples{
+data(esophagus)
+sample_names(esophagus)
+get_taxa(esophagus, "B")
+}
+\seealso{
+\code{\link{get_sample}}
+ \code{\link{taxa_names}}
+ \code{\link{sample_names}}
+}
+
diff --git a/man/get_taxa_unique.Rd b/man/get_taxa_unique.Rd
new file mode 100644
index 0000000..d737ab6
--- /dev/null
+++ b/man/get_taxa_unique.Rd
@@ -0,0 +1,39 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/almostAllAccessors.R
+\name{get_taxa_unique}
+\alias{get_taxa_unique}
+\title{Get a unique vector of the observed taxa at a particular taxonomic rank}
+\usage{
+get_taxa_unique(physeq, taxonomic.rank=rank_names(physeq)[1], errorIfNULL=TRUE)
+}
+\arguments{
+\item{physeq}{(Required). \code{\link{taxonomyTable-class}}, or \code{\link{phyloseq-class}}.}
+
+\item{taxonomic.rank}{(Optional). Character. The taxonomic rank to use. Must select
+from the set indicated by \code{rank_names(physeq)}. Default is
+to take the first column of the \code{taxonomyTable} component.}
+
+\item{errorIfNULL}{(Optional). Logical. Should the accessor stop with
+an error if the slot is empty (\code{NULL})? Default \code{TRUE}.}
+}
+\value{
+Character vector. Unique vector of the observed taxa
+ at a particular taxonomic rank
+}
+\description{
+This is a simple accessor function to make it more convenient to determine
+the different taxa present for a particular taxonomic rank
+in a given \code{\link{phyloseq-class}} object.
+}
+\examples{
+data(enterotype)
+get_taxa_unique(enterotype)
+data(GlobalPatterns)
+get_taxa_unique(GlobalPatterns, "Family")
+}
+\seealso{
+\code{\link{get_taxa}}
+ \code{\link{taxa_names}}
+ \code{\link{sample_names}}
+}
+
diff --git a/man/get_variable.Rd b/man/get_variable.Rd
new file mode 100644
index 0000000..33c5a53
--- /dev/null
+++ b/man/get_variable.Rd
@@ -0,0 +1,35 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/almostAllAccessors.R
+\name{get_variable}
+\alias{get_variable}
+\title{Get the values for a particular variable in sample_data}
+\usage{
+get_variable(physeq, varName)
+}
+\arguments{
+\item{physeq}{(Required). \code{\link{sample_data-class}}, or \code{\link{phyloseq-class}}.}
+
+\item{varName}{(Required). Character string of the variable name in \code{sample_data}.
+Use \code{sample_variables(physeq)} for available variables in your object.}
+}
+\value{
+Data. The class of the data depends on the contents of sample_data.
+}
+\description{
+This is a simple accessor function for streamlining access
+to values/vectors/factors/etc contained in the sample_data.
+}
+\examples{
+# Load the GlobalPatterns dataset into the workspace environment
+data(GlobalPatterns)
+# Look at the different values for SampleType
+get_variable(GlobalPatterns, "SampleType")
+}
+\seealso{
+\code{\link{get_taxa}}
+ \code{\link{taxa_names}}
+ \code{\link{sample_names}}
+
+ \code{\link{sample_variables}}
+}
+
diff --git a/man/getslots.phyloseq.Rd b/man/getslots.phyloseq.Rd
new file mode 100644
index 0000000..1358df2
--- /dev/null
+++ b/man/getslots.phyloseq.Rd
@@ -0,0 +1,34 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/phyloseq-class.R
+\name{getslots.phyloseq}
+\alias{getslots.phyloseq}
+\title{Return the non-empty slot names of a phyloseq object.}
+\usage{
+getslots.phyloseq(physeq)
+}
+\arguments{
+\item{physeq}{A \code{\link{phyloseq-class}} object. If \code{physeq} is a component
+data class, then just returns the class of \code{physeq}.}
+}
+\value{
+identical to getSlots. A named character vector of the slot classes
+of a particular S4 class, where each element is named by the slot name it
+represents. If \code{physeq} is a component data object,
+then a vector of length (1) is returned, named according to its slot name in
+the \code{\link{phyloseq-class}}.
+}
+\description{
+Like \code{\link{getSlots}}, but returns the class name if argument
+is component data object.
+}
+\examples{
+#
+ data(GlobalPatterns)
+ getslots.phyloseq(GlobalPatterns)
+ data(esophagus)
+ getslots.phyloseq(esophagus)
+}
+\seealso{
+merge_phyloseq
+}
+
diff --git a/man/import.Rd b/man/import.Rd
new file mode 100644
index 0000000..5fac64c
--- /dev/null
+++ b/man/import.Rd
@@ -0,0 +1,73 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/IO-methods.R
+\name{import}
+\alias{import}
+\title{Universal import method (wrapper) for phyloseq-package}
+\usage{
+import(pipelineName, ...)
+}
+\arguments{
+\item{pipelineName}{(Required). Character string. The name of the
+analysis tool / pipeline / package
+that created the OTU-cluster data or other data that you now want to import.
+Current options are \code{c("mothur", "pyrotagger", "QIIME", "RDP")}, and
+only the first letter is necessary.}
+
+\item{...}{(Required). Additional named arguments providing file paths, and possible
+other parameters to the desired tool-specific import function.}
+}
+\value{
+In most cases a \code{\link{phyloseq-class}} will be returned, though
+ the included component data will vary by pipeline/tool, and also
+ by the types of data files provided.
+ The expected behavior is to return the most-comprehensive object possible,
+ given the provided arguments and pipeline/tool.
+}
+\description{
+A user must still understand the additional arguments required for each
+type of import data. Those arguments are described in detail at the
+tool-specific \code{import_*} links below. Each clustering tool / package / pipeline
+has its own idiosyncratic set of file names / types, and it remains the
+responsibility of the user to understand which file-path should be provided
+to each argument for the particular importing submethod. This method
+merely provides a central documentation and method-name, and the arguments
+are passed along as-is.
+}
+\examples{
+ ## See documentation of a specific import function
+}
+\references{
+BIOM: \url{http://www.biom-format.org/}
+
+mothur: \url{http://www.mothur.org/wiki/Main_Page}
+
+PyroTagger: \url{http://pyrotagger.jgi-psf.org/}
+
+QIIME: \url{http://qiime.org/}
+
+RDP pipeline: \url{http://pyro.cme.msu.edu/index.jsp}
+}
+\seealso{
+For BIOM format, see:
+\code{\link{import_biom}}
+
+For mothur, see:
+\code{\link{import_mothur}}
+
+Separate tools for mothur are also:
+\code{\link{show_mothur_cutoffs}}
+\code{\link{import_mothur_dist}}
+\code{\link{export_mothur_dist}}
+
+For PyroTagger, see:
+\code{\link{import_pyrotagger_tab}}
+
+For QIIME legacy format, see:
+\code{\link{import_qiime}}
+
+For RDP pipeline, see:
+\code{\link{import_RDP_cluster}}
+
+\code{\link{import_RDP_otu}}
+}
+
diff --git a/man/import_RDP_cluster.Rd b/man/import_RDP_cluster.Rd
new file mode 100644
index 0000000..58b3f1c
--- /dev/null
+++ b/man/import_RDP_cluster.Rd
@@ -0,0 +1,49 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/IO-methods.R
+\name{import_RDP_cluster}
+\alias{import_RDP_cluster}
+\title{Import RDP cluster file and return otu_table (abundance table).}
+\usage{
+import_RDP_cluster(RDP_cluster_file)
+}
+\arguments{
+\item{RDP_cluster_file}{A character string. The name of the \code{".clust"}
+file produced by the
+complete linkage clustering step of the RDP pipeline.}
+}
+\value{
+An \code{\link{otu_table}} object parsed from the \code{".clust"} file.
+}
+\description{
+The RDP cluster pipeline (specifically, the output of the complete linkage clustering step)
+has no formal documentation for the \code{".clust"}
+file or its apparent sequence naming convention.
+}
+\details{
+\code{http://pyro.cme.msu.edu/index.jsp}
+
+The cluster file itself contains
+the names of all sequences contained in the input alignment. If the upstream
+barcode and alignment processing steps are also done with the RDP pipeline,
+then the sequence names follow a predictable naming convention wherein each
+sequence is named by its sample and sequence ID, separated by a \code{"_"} as
+delimiter:
+
+\code{"sampleName_sequenceIDnumber"}
+
+This import function assumes that the sequence names in the cluster file follow
+this convention, and that the sample name does not contain any \code{"_"}. It
+is unlikely to work if this is not the case. It is likely to work if you used
+the upstream steps in the RDP pipeline to process your raw (barcoded, untrimmed)
+fasta/fastq data.
+
+This function first loops through the \code{".clust"} file and collects all
+of the sample names that appear. It secondly loops through each OTU (\code{"cluster"};
+each row of the cluster file) and sums the number of sequences (reads) from
+each sample. The resulting abundance table of OTU-by-sample is trivially
+coerced to an \code{\link{otu_table}} object, and returned.
+}
+\references{
+\url{http://pyro.cme.msu.edu/index.jsp}
+}
+
diff --git a/man/import_RDP_otu.Rd b/man/import_RDP_otu.Rd
new file mode 100644
index 0000000..4616ee8
--- /dev/null
+++ b/man/import_RDP_otu.Rd
@@ -0,0 +1,44 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/IO-methods.R
+\name{import_RDP_otu}
+\alias{import_RDP_otu}
+\title{Import new RDP OTU-table format}
+\usage{
+import_RDP_otu(otufile)
+}
+\arguments{
+\item{otufile}{(Optional).
+A character string indicating the file location of the OTU file,
+produced/exported according to the instructions above.}
+}
+\value{
+A \code{\link{otu_table-class}} object.
+}
+\description{
+Recently updated tools on RDP Pyro site make it easier to import Pyrosequencing output
+into R. The modified tool ``Cluster To R Formatter'' can take a cluster file
+(generated from RDP Clustering tools) to create a community data matrix file
+for distance cutoff range you are interested in. The resulting output file
+is a tab-delimited file containing the number of sequences for each sample
+for each OTU. The OTU header naming convention is \code{"OTU_"} followed by the OTU
+number in the cluster file. It pads ``0''s to make the OTU header easy to sort.
+The OTU numbers are not necessarily in order.
+}
+\examples{
+otufile <- system.file("extdata", "rformat_dist_0.03.txt.gz", package="phyloseq")
+### the gzipped file is automatically recognized, and read using R-connections
+ex_otu <- import_RDP_otu(otufile)
+class(ex_otu)
+ntaxa(ex_otu)
+nsamples(ex_otu)
+sample_sums(ex_otu)
+head(t(ex_otu))
+}
+\seealso{
+An alternative ``cluster'' file importer for RDP results:
+\code{\link{import_RDP_cluster}}
+
+The main RDP-pyrosequencing website
+\url{http://pyro.cme.msu.edu/index.jsp}
+}
+
diff --git a/man/import_biom.Rd b/man/import_biom.Rd
new file mode 100644
index 0000000..5450f44
--- /dev/null
+++ b/man/import_biom.Rd
@@ -0,0 +1,171 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/IO-methods.R
+\name{import_biom}
+\alias{import_biom}
+\title{Import phyloseq data from biom-format file}
+\usage{
+import_biom(BIOMfilename,
+ treefilename=NULL, refseqfilename=NULL, refseqFunction=readDNAStringSet, refseqArgs=NULL,
+ parseFunction=parse_taxonomy_default, parallel=FALSE, version=1.0, ...)
+}
+\arguments{
+\item{BIOMfilename}{(Required). A character string indicating the
+file location of the BIOM formatted file. This is a JSON formatted file,
+specific to biological datasets, as described in
+\href{http://www.qiime.org/svn_documentation/documentation/biom_format.html}{the biom-format home page}.
+In principle, this file should include your OTU abundance data (OTU table),
+your taxonomic classification data (taxonomy table), as well as your
+sample data, for instance what might be in your ``sample map'' in QIIME.
+A phylogenetic tree is not yet supported by biom-format, and so is a
+separate argument here. If, for some reason, your biom-format file is
+missing one of these mentioned data types but you have it in a separate file,
+you can first import the data that is in the biom file using this function,
+\code{import_biom}, and then ``merge'' the remaining data after you have
+imported with other tools using the relatively general-purpose data
+merging function called \code{\link{merge_phyloseq}}.}
+
+\item{treefilename}{(Optional). Default value is \code{NULL}.
+A file representing a phylogenetic tree
+or a \code{\link{phylo}} object.
+Files can be NEXUS or Newick format.
+See \code{\link{read_tree}} for more details.
+Also, if using a recent release of the GreenGenes database tree,
+try the \code{\link{read_tree_greengenes}} function --
+this should solve some issues specific to importing that tree.
+If provided, the tree should have the same OTUs/tip-labels
+as the OTUs in the other files.
+Any taxa or samples missing in one of the files is removed from all.
+As an example from the QIIME pipeline,
+this tree would be a tree of the representative 16S rRNA sequences from each OTU
+cluster, with the number of leaves/tips equal to the number of taxa/species/OTUs,
+or the complete reference database tree that contains the OTU identifiers
+of every OTU in your abundance table.
+Note that this argument can be a tree object (\code{\link[ape]{phylo}}-class)
+for cases where the tree has been --- or needs to be --- imported separately,
+as in the case of the GreenGenes tree mentioned earlier (\code{\link{read_tree_greengenes}}).}
+
+\item{refseqfilename}{(Optional). Default \code{NULL}.
+The file path of the biological sequence file that contains at a minimum
+a sequence for each OTU in the dataset.
+Alternatively, you may provide an already-imported
+\code{\link[Biostrings]{XStringSet}} object that satisfies this condition.
+In either case, the \code{\link{names}} of each OTU need to match exactly the
+\code{\link{taxa_names}} of the other components of your data.
+If this is not the case, for example if the data file is a FASTA format but
+contains additional information after the OTU name in each sequence header,
+then some additional parsing is necessary,
+which you can either perform separately before calling this function,
+or describe explicitly in a custom function provided in the (next) argument,
+\code{refseqFunction}.
+Note that the \code{\link[Biostrings]{XStringSet}} class can represent any
+arbitrary sequence, including user-defined subclasses, but is most-often
+used to represent RNA, DNA, or amino acid sequences.
+The only constraint is that this special list of sequences
+has exactly one named element for each OTU in the dataset.}
+
+\item{refseqFunction}{(Optional).
+Default is \code{\link[Biostrings]{readDNAStringSet}},
+which expects to read a fasta-formatted DNA sequence file.
+If your reference sequences for each OTU are amino acid, RNA, or something else,
+then you will need to specify a different function here.
+This is the function used to read the file connection provided as
+the previous argument, \code{refseqfilename}.
+This argument is ignored if \code{refseqfilename} is already a
+\code{\link[Biostrings]{XStringSet}} class.}
+
+\item{refseqArgs}{(Optional).
+Default \code{NULL}.
+Additional arguments to \code{refseqFunction}.
+See \code{\link[Biostrings]{XStringSet-io}} for details about
+additional arguments to the standard read functions in the Biostrings package.}
+
+\item{parseFunction}{(Optional). A function. It must be a function that
+takes as its first argument a character vector of taxonomic rank labels
+for a single OTU
+and parses and names each element
+(and optionally removes unwanted elements).
+Further details and examples of acceptable functions are provided
+in the documentation for \code{\link{parse_taxonomy_default}}.
+There are many variations on taxonomic nomenclature, and naming
+conventions used to store that information in various taxonomic
+databases and phylogenetic assignment algorithms. A popular database,
+\href{http://greengenes.lbl.gov/cgi-bin/nph-index.cgi}{greengenes},
+has its own custom parsing function provided in the phyloseq package,
+\code{\link{parse_taxonomy_greengenes}},
+and more can be contributed or posted as code snippets as needed.
+They can be custom-defined by a user immediately prior to the call to
+\code{\link{import_biom}}, and this is a suggested first step to take
+when trouble-shooting taxonomy-related errors during file import.}
+
+\item{parallel}{(Optional). Logical. Wrapper option for \code{.parallel}
+ parameter in \code{plyr-package} functions. If \code{TRUE}, apply
+ parsing functions in parallel, using parallel backend provided by
+ \code{\link{foreach}} and its supporting backend packages. One caveat,
+ plyr-parallelization currently works most-cleanly with \code{multicore}-like
+ backends (Mac OS X, Unix?), and may throw warnings for SNOW-like backends.
+ See the example below for code invoking multicore-style backend within
+ the \code{doParallel} package.
+
+ Finally, for many datasets a parallel import should not be necessary
+ because a serial import will be just as fast and the import is often only
+ performed one time; after which the data should be saved as an RData file
+ using the \code{\link{save}} function.}
+
+\item{version}{(Optional). Numeric. The expected version number of the file.
+As the BIOM format evolves, version-specific importers may be available
+by adjusting the version value. Default is \code{1.0}.
+Not yet implemented. Parsing of the biom-format is done mostly
+by the biom package now available in CRAN.}
+
+\item{...}{Additional parameters passed on to \code{\link{read_tree}}.}
+}
+\value{
+A \code{\link{phyloseq-class}} object.
+}
+\description{
+New versions of QIIME produce a more-comprehensive and formally-defined
+JSON file format, called biom file format:
+}
+\details{
+``The biom file format (canonically pronounced `biome') is designed to be a
+general-use format for representing counts of observations in one or
+more biological samples. BIOM is a recognized standard for the Earth Microbiome
+Project and is a Genomics Standards Consortium candidate project.''
+
+\url{http://biom-format.org/}
+}
+\examples{
+# An included example of a rich dense biom file
+rich_dense_biom <- system.file("extdata", "rich_dense_otu_table.biom", package="phyloseq")
+import_biom(rich_dense_biom, parseFunction=parse_taxonomy_greengenes)
+# An included example of a rich sparse biom file
+rich_sparse_biom <- system.file("extdata", "rich_sparse_otu_table.biom", package="phyloseq")
+import_biom(rich_sparse_biom, parseFunction=parse_taxonomy_greengenes)
+# # # Example code for importing large file with parallel backend
+# library("doParallel")
+# registerDoParallel(cores=6)
+# import_biom("my/file/path/file.biom", parseFunction=parse_taxonomy_greengenes, parallel=TRUE)
+}
+\references{
+\href{http://www.qiime.org/svn_documentation/documentation/biom_format.html}{biom-format}
+}
+\seealso{
+\code{\link{import}}
+
+\code{\link{import_qiime}}
+
+\code{\link{read_tree}}
+
+\code{\link{read_tree_greengenes}}
+
+\code{\link[biomformat]{read_biom}}
+
+\code{\link[biomformat]{biom_data}}
+
+\code{\link[biomformat]{sample_metadata}}
+
+\code{\link[biomformat]{observation_metadata}}
+
+\code{\link[Biostrings]{XStringSet-io}}
+}
+
diff --git a/man/import_env_file.Rd b/man/import_env_file.Rd
new file mode 100644
index 0000000..d1075a7
--- /dev/null
+++ b/man/import_env_file.Rd
@@ -0,0 +1,42 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/IO-methods.R
+\name{import_env_file}
+\alias{import_env_file}
+\title{Read a UniFrac-formatted ENV file.}
+\usage{
+import_env_file(envfilename, tree=NULL, sep="\t", ...)
+}
+\arguments{
+\item{envfilename}{(Required). A character string of the ENV filename (relative or absolute)}
+
+\item{tree}{(Optional). \code{\link{phylo-class}} object to be paired with
+the output otu_table.}
+
+\item{sep}{A character string indicating the delimiter used in the file.
+The default is \code{"\t"}.}
+
+\item{...}{Additional parameters passed on to \code{\link{read.table}}.}
+}
+\value{
+An \code{\link{otu_table-class}}, or \code{\link{phyloseq-class}} if
+ a \code{\link{phylo-class}} argument is provided to \code{tree}.
+}
+\description{
+Convenience wrapper function to read the environment-file, as formatted for
+input to the UniFrac server (\url{http://bmf2.colorado.edu/unifrac/}).
+The official format of these files is that
+each row specifies (in order) the sequence name, source sample, and (optionally)
+the number of times the sequence was observed.
+}
+\examples{
+# import_env_file(myEnvFile, myTree)
+}
+\references{
+\url{http://bmf2.colorado.edu/unifrac/}
+}
+\seealso{
+\code{\link{import}}
+
+\code{\link{tip_glom}}
+}
+
diff --git a/man/import_mothur.Rd b/man/import_mothur.Rd
new file mode 100644
index 0000000..17fd11d
--- /dev/null
+++ b/man/import_mothur.Rd
@@ -0,0 +1,121 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/IO-methods.R
+\name{import_mothur}
+\alias{import_mothur}
+\title{General function for importing mothur data files into phyloseq.}
+\usage{
+import_mothur(mothur_list_file = NULL, mothur_group_file = NULL,
+ mothur_tree_file = NULL, cutoff = NULL, mothur_shared_file = NULL,
+ mothur_constaxonomy_file = NULL, parseFunction = parse_taxonomy_default)
+}
+\arguments{
+\item{mothur_list_file}{(Optional). The list file name / location produced by \emph{mothur}.}
+
+\item{mothur_group_file}{(Optional). The name/location of the group file produced
+by \emph{mothur}'s \code{make.group()} function. It contains information
+about the sample source of individual sequences, necessary for creating a
+species/taxa abundance table (\code{otu_table}). See
+\code{http://www.mothur.org/wiki/Make.group}}
+
+\item{mothur_tree_file}{(Optional).
+A tree file, presumably produced by \emph{mothur},
+and readable by \code{\link{read_tree}}.
+The file probably has extension \code{".tree"}.}
+
+\item{cutoff}{(Optional). A character string indicating the cutoff value, (or \code{"unique"}),
+that matches one of the cutoff-values used to produce the OTU clustering
+results contained within the list-file created by \emph{mothur} (and specified
+by the \code{mothur_list_file} argument). The default
+is to take the largest value among the cutoff values contained in the list
+file. If only one cutoff is included in the file, it is taken and this
+argument does not need to be specified. Note that the \code{cluster()}
+function within the \emph{mothur} package will often produce a list file
+with multiple cutoff values, even if a specific cutoff is specified. It is
+suggested that you check which cutoff values are available in a given list
+file using the \code{\link{show_mothur_cutoffs}} function.}
+
+\item{mothur_shared_file}{(Optional). A
+\href{http://www.mothur.org/wiki/Shared_file}{shared file}
+produced by \emph{mothur}.}
+
+\item{mothur_constaxonomy_file}{(Optional). A
+\href{http://www.mothur.org/wiki/Constaxonomy_file}{consensus taxonomy file}
+produced by \emph{mothur}.}
+
+\item{parseFunction}{(Optional). A specific function used for parsing the taxonomy string.
+See \code{\link{parse_taxonomy_default}} for an example. If the default is
+used, this function expects a semi-colon delimited taxonomy string, with
+no additional rank specifier. A common taxonomic database is GreenGenes,
+and in recent versions its taxonomy entries include a prefix, which is best cleaved
+and used to precisely label the ranks (\code{\link{parse_taxonomy_greengenes}}).}
+}
+\value{
+The object class depends on the provided arguments.
+ A phyloseq object is returned if enough data types are provided.
+ If only one data component can be created from the data, it is returned.
+
+ FASTER (recommended for larger data sizes):
+
+ If only a \code{mothur_constaxonomy_file} is provided,
+ then a \code{\link{taxonomyTable-class}} object is returned.
+
+ If only a \code{mothur_shared_file} is provided,
+ then an \code{\link{otu_table}} object is returned.
+
+ SLOWER (but fine for small file sizes):
+
+ The list and group file formats are extremely inefficient for large datasets,
+ and they are not recommended. The mothur software provides tools for
+ converting to other file formats, such as a so-called ``shared'' file.
+ You should provide a shared file, or group/list files, but not
+ both at the same time.
+ If only a list and group file are provided,
+ then an \code{otu_table} object is returned.
+ Similarly, if only a list and tree file are provided,
+ then only a tree is returned (\code{\link[ape]{phylo}}-class).
+}
+\description{
+Technically all parameters are optional,
+but if you don't provide any file connections, then nothing will be returned.
+While the \code{list} and \code{group} files are the first two arguments
+for legacy-compatibility reasons, we don't recommend that you use these
+file types with modern (large) datasets. They are comically inefficient, as
+they store the name of every sequencing read in both files. The \emph{mothur}
+package provides conversion utilities to create other more-efficient formats,
+which we recommend, like
+the \href{http://www.mothur.org/wiki/Shared_file}{shared file} for an OTU table.
+Alternatively, mothur also provides a utility to create a biom-format file
+that is independent of OTU clustering platform. Biom-format files
+should be imported not with this function, but with \code{\link{import_biom}}.
+The resulting objects after import should be \code{\link{identical}} in R.
+}
+\examples{
+# # The following example assumes you have downloaded the esophagus example
+# # dataset from the mothur wiki:
+# # "http://www.mothur.org/wiki/Esophageal_community_analysis"
+# # "http://www.mothur.org/w/images/5/55/Esophagus.zip"
+# # The path on your machine may (probably will) vary
+# mothur_list_file <- "~/Downloads/mothur/Esophagus/esophagus.an.list"
+# mothur_group_file <- "~/Downloads/mothur/Esophagus/esophagus.good.groups"
+# mothur_tree_file <- "~/Downloads/mothur/Esophagus/esophagus.tree"
+# # # Actual examples follow:
+# show_mothur_cutoffs(mothur_list_file)
+# test1 <- import_mothur(mothur_list_file, mothur_group_file, mothur_tree_file)
+# test2 <- import_mothur(mothur_list_file, mothur_group_file, mothur_tree_file, cutoff="0.02")
+# # Returns just a tree
+# import_mothur(mothur_list_file, mothur_tree_file=mothur_tree_file)
+# # Returns just an otu_table
+# import_mothur(mothur_list_file, mothur_group_file=mothur_group_file)
+# # Returns an error
+# import_mothur(mothur_list_file)
+# # Should return an "OMG, you must provide the list file" error
+# import_mothur()
+}
+\references{
+\url{http://www.mothur.org/wiki/Main_Page}
+
+Schloss, P.D., et al., Introducing mothur: Open-source, platform-independent,
+community-supported software for describing and comparing microbial communities.
+Appl Environ Microbiol, 2009. 75(23):7537-41.
+}
+
diff --git a/man/import_mothur_constaxonomy.Rd b/man/import_mothur_constaxonomy.Rd
new file mode 100644
index 0000000..07a3668
--- /dev/null
+++ b/man/import_mothur_constaxonomy.Rd
@@ -0,0 +1,36 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/IO-methods.R
+\name{import_mothur_constaxonomy}
+\alias{import_mothur_constaxonomy}
+\title{Import mothur constaxonomy file and return a taxonomyTable}
+\usage{
+import_mothur_constaxonomy(mothur_constaxonomy_file,
+ parseFunction = parse_taxonomy_default)
+}
+\arguments{
+\item{mothur_constaxonomy_file}{(Required). A
+\href{http://www.mothur.org/wiki/Constaxonomy_file}{consensus taxonomy file}
+produced by \emph{mothur}.}
+
+\item{parseFunction}{(Optional). A specific function used for parsing the taxonomy string.
+See \code{\link{parse_taxonomy_default}} for an example. If the default is
+used, this function expects a semi-colon delimited taxonomy string, with
+no additional rank specifier. A common taxonomic database is GreenGenes,
+and for recent versions its taxonomy includes a prefix, which is best cleaved
+and used to precisely label the ranks (\code{\link{parse_taxonomy_greengenes}}).}
+}
+\value{
+A \code{\link{taxonomyTable-class}} object.
+}
+\description{
+Import mothur constaxonomy file and return a taxonomyTable
+}
+\seealso{
+\code{\link{import_mothur}}
+
+\code{\link{tax_table}}
+
+\code{\link{phyloseq}}
+}
+\keyword{internal}
+
diff --git a/man/import_mothur_dist.Rd b/man/import_mothur_dist.Rd
new file mode 100644
index 0000000..9956171
--- /dev/null
+++ b/man/import_mothur_dist.Rd
@@ -0,0 +1,32 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/IO-methods.R
+\name{import_mothur_dist}
+\alias{import_mothur_dist}
+\title{Import mothur-formatted distance file}
+\usage{
+import_mothur_dist(mothur_dist_file)
+}
+\arguments{
+\item{mothur_dist_file}{Required. The distance file name / location produced by \emph{mothur}.}
+}
+\value{
+A distance matrix object describing all sequences in a dataset.
+}
+\description{
+The mothur application will produce a file containing the pairwise distances
+between all sequences in a dataset. This distance matrix can be the basis for
+OTU cluster designations. R also has many built-in or off-the-shelf tools for
+dealing with distance matrices.
+}
+\examples{
+# # Take a look at the dataset shown here as an example:
+# # "http://www.mothur.org/wiki/Esophageal_community_analysis"
+# # find the file ending with extension ".dist", download to your system
+# # The location of your file may vary
+# mothur_dist_file <- "~/Downloads/mothur/Esophagus/esophagus.dist"
+# myNewDistObject <- import_mothur_dist(mothur_dist_file)
+}
+\seealso{
+\code{\link{import_mothur}}
+}
+
diff --git a/man/import_mothur_groups.Rd b/man/import_mothur_groups.Rd
new file mode 100644
index 0000000..381750b
--- /dev/null
+++ b/man/import_mothur_groups.Rd
@@ -0,0 +1,32 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/IO-methods.R
+\name{import_mothur_groups}
+\alias{import_mothur_groups}
+\title{Parse mothur group file into a simple hash table.}
+\usage{
+import_mothur_groups(mothur_group_file)
+}
+\arguments{
+\item{mothur_group_file}{A character string indicating the location of the
+\emph{mothur}-produced group file in which the sample-source of each sequence
+is recorded. See
+\code{http://www.mothur.org/wiki/Make.group}}
+}
+\value{
+A data.frame that is effectively a hash table between sequence names
+ and their sample source.
+}
+\description{
+The data.frame object
+returned by this function is not immediately useable by other \emph{phyloseq}
+functions, and must be first parsed in conjunction with a separate \emph{mothur}
+\code{"list"} file. This function is made accessible to \emph{phyloseq} users
+for troubleshooting and inspection, but the \code{\link{import_mothur}()} function
+is suggested if the goal is to import the OTU clustering results from \emph{mothur}
+into \emph{phyloseq}. You will need both a group file and a list file for that end.
+}
+\seealso{
+\code{\link{import_mothur}}
+}
+\keyword{internal}
+
diff --git a/man/import_mothur_otu_table.Rd b/man/import_mothur_otu_table.Rd
new file mode 100644
index 0000000..c02b67b
--- /dev/null
+++ b/man/import_mothur_otu_table.Rd
@@ -0,0 +1,41 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/IO-methods.R
+\name{import_mothur_otu_table}
+\alias{import_mothur_otu_table}
+\title{Import mothur list and group files and return an otu_table}
+\usage{
+import_mothur_otu_table(mothur_list_file, mothur_group_file, cutoff=NULL)
+}
+\arguments{
+\item{mothur_list_file}{The list file name and/or location as produced by \emph{mothur}.}
+
+\item{mothur_group_file}{The name/location of the group file produced
+by \emph{mothur}'s \code{make.group()} function. It contains information
+about the sample source of individual sequences, necessary for creating a
+species/taxa abundance table (\code{otu_table}). See
+\code{http://www.mothur.org/wiki/Make.group}}
+
+\item{cutoff}{A character string indicating the cutoff value, (or \code{"unique"}),
+that matches one of the cutoff-values used to produce the OTU clustering
+results contained within the list-file created by \emph{mothur} (and specified
+by the \code{mothur_list_file} argument).
+The default
+is to take the largest value among the cutoff values contained in the list
+file. If only one cutoff is included in the file, it is taken and this
+argument does not need to be specified. Note that the \code{cluster()}
+function within the \emph{mothur} package will often produce a list file
+with multiple cutoff values, even if a specific cutoff is specified. It is
+suggested that you check which cutoff values are available in a given list
+file using the \code{\link{show_mothur_cutoffs}} function.}
+}
+\value{
+An \code{\link{otu_table}} object.
+}
+\description{
+Import mothur list and group files and return an otu_table
+}
+\seealso{
+\code{\link{import_mothur}}
+}
+\keyword{internal}
+
diff --git a/man/import_mothur_otulist.Rd b/man/import_mothur_otulist.Rd
new file mode 100644
index 0000000..872ef72
--- /dev/null
+++ b/man/import_mothur_otulist.Rd
@@ -0,0 +1,43 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/IO-methods.R
+\name{import_mothur_otulist}
+\alias{import_mothur_otulist}
+\title{Import mothur list file and return as list object in R.}
+\usage{
+import_mothur_otulist(mothur_list_file, cutoff=NULL)
+}
+\arguments{
+\item{mothur_list_file}{The list file name and/or location as produced by \emph{mothur}.}
+
+\item{cutoff}{A character string indicating the cutoff value, (or \code{"unique"}),
+that matches one of the cutoff-values used to produce the OTU clustering
+results contained within the list-file created by \emph{mothur}. The default
+is to take the largest value among the cutoff values contained in the list
+file. If only one cutoff is included in the file, it is taken and this
+argument does not need to be specified. Note that the \code{cluster()}
+function within the \emph{mothur} package will often produce a list file
+with multiple cutoff values, even if a specific cutoff is specified. It is
+suggested that you check which cutoff values are available in a given list
+file using the \code{\link{show_mothur_cutoffs}} function.}
+}
+\value{
+A list, where each element is a character vector of 1 or more
+ sequence identifiers, indicating how each sequence from the original data
+ is clustered into OTUs by \emph{mothur}. Note that in some cases this is highly
+ dependent on the choice for \code{cutoff}.
+}
+\description{
+This is a user-available module of a more comprehensive function for importing
+OTU clustering/abundance data using the \emph{mothur} package. The list object
+returned by this function is not immediately useable by other \emph{phyloseq}
+functions, and must be first parsed in conjunction with a separate \emph{mothur}
+\code{"group"} file. This function is made accessible to \emph{phyloseq} users
+for troubleshooting and inspection, but the \code{\link{import_mothur}()} function
+is suggested if the goal is to import the OTU clustering results from \emph{mothur}
+into \emph{phyloseq}.
+}
+\seealso{
+\code{\link{show_mothur_cutoffs}}, \code{\link{import_mothur}}
+}
+\keyword{internal}
+
diff --git a/man/import_mothur_shared.Rd b/man/import_mothur_shared.Rd
new file mode 100644
index 0000000..6046c34
--- /dev/null
+++ b/man/import_mothur_shared.Rd
@@ -0,0 +1,24 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/IO-methods.R
+\name{import_mothur_shared}
+\alias{import_mothur_shared}
+\title{Import mothur shared file and return an otu_table}
+\usage{
+import_mothur_shared(mothur_shared_file, cutoff = NULL)
+}
+\arguments{
+\item{mothur_shared_file}{(Required). A
+\href{http://www.mothur.org/wiki/Shared_file}{shared file}
+produced by \emph{mothur}.}
+}
+\value{
+An \code{\link{otu_table}} object.
+}
+\description{
+Import mothur shared file and return an otu_table
+}
+\seealso{
+\code{\link{import_mothur}}
+}
+\keyword{internal}
+
diff --git a/man/import_pyrotagger_tab.Rd b/man/import_pyrotagger_tab.Rd
new file mode 100644
index 0000000..5c50a0e
--- /dev/null
+++ b/man/import_pyrotagger_tab.Rd
@@ -0,0 +1,67 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/IO-methods.R
+\name{import_pyrotagger_tab}
+\alias{import_pyrotagger_tab}
+\title{Imports a tab-delimited version of the pyrotagger output file.}
+\usage{
+import_pyrotagger_tab(pyrotagger_tab_file,
+strict_taxonomy=FALSE, keep_potential_chimeras=FALSE)
+}
+\arguments{
+\item{pyrotagger_tab_file}{(Required). A character string. The name of the tab-delimited
+pyrotagger output table.}
+
+\item{strict_taxonomy}{(Optional). Logical. Default \code{FALSE}. Should the taxonomyTable
+component be limited to just taxonomic data? Default includes all fields from
+the pyrotagger file.}
+
+\item{keep_potential_chimeras}{(Optional). Logical. Default \code{FALSE}. The
+pyrotagger output also includes OTUs that are tagged by pyrotagger as likely
+chimeras. These putative chimeric OTUs can be retained if set to \code{TRUE}.
+The putative chimeras are excluded by default.}
+}
+\value{
+An \code{otuTax} object containing both the otu_table and TaxonomyTable data
+ components, parsed from the pyrotagger output.
+}
+\description{
+PyroTagger is a web-server that takes raw, barcoded 16S rRNA amplicon sequences
+and returns an excel spreadsheet (\code{".xls"}) with both abundance and
+taxonomy data. It also includes some confidence information related to the
+taxonomic assignment.
+}
+\details{
+PyroTagger is created and maintained by the Joint Genome Institute
+at \code{"http://pyrotagger.jgi-psf.org/"}
+
+The typical output from PyroTagger is a spreadsheet format \code{".xls"}, which poses
+additional import challenges. However, virtually all spreadsheet applications
+support the \code{".xls"} format, and can further export this file in a
+tab-delimited format. It is recommended that you convert the xls-file without
+any modification (as tempting as it might be once you have loaded it) into a
+tab-delimited text file. Deselect any options to encapsulate fields in quotes,
+as extra quotes around each cell's contents might cause problems during
+file processing. These quotes will also inflate the file-size, so leave them out
+as much as possible, while also resisting any temptation to modify the xls-file
+``by hand''.
+
+A highly-functional and free spreadsheet application can be obtained as part
+of the cross-platform \code{OpenOffice} suite. It works for the above
+required conversion. Go to \code{"http://www.openoffice.org/"}.
+
+It is regrettable that this importer does not take the xls-file directly
+as input. However, because of the moving-target nature of spreadsheet
+file formats, there is limited support for direct import of these formats into
+\code{R}. Rather than add to the dependency requirements of \emph{phyloseq}
+and the relative support of these xls-support packages, it seems more efficient
+to choose an arbitrary delimited text format, and focus on the data
+structure in the PyroTagger output. This will be easier to support in the
+long-run.
+}
+\examples{
+## New_otuTaxObject <- import_pyrotagger_tab(pyrotagger_tab_file)
+}
+\references{
+\url{http://pyrotagger.jgi-psf.org/}
+}
+
diff --git a/man/import_qiime.Rd b/man/import_qiime.Rd
new file mode 100644
index 0000000..5dd1032
--- /dev/null
+++ b/man/import_qiime.Rd
@@ -0,0 +1,156 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/IO-methods.R
+\name{import_qiime}
+\alias{import_qiime}
+\title{Import function to read the now legacy-format QIIME OTU table.}
+\usage{
+import_qiime(otufilename = NULL, mapfilename = NULL, treefilename = NULL,
+ refseqfilename = NULL, refseqFunction = readDNAStringSet,
+ refseqArgs = NULL, parseFunction = parse_taxonomy_qiime, verbose = TRUE,
+ ...)
+}
+\arguments{
+\item{otufilename}{(Optional). A character string indicating
+the file location of the OTU file.
+The combined OTU abundance and taxonomic identification file,
+tab-delimited, as produced by QIIME under default output settings.
+Default value is \code{NULL}.}
+
+\item{mapfilename}{(Optional). The QIIME map file is required
+for processing barcoded primers in QIIME
+as well as some of the post-clustering analysis. This is a required
+input file for running QIIME. Its strict formatting specification should be
+followed for correct parsing by this function.
+Default value is \code{NULL}.}
+
+\item{treefilename}{(Optional). Default value is \code{NULL}.
+A file representing a phylogenetic tree
+or a \code{\link{phylo}} object.
+Files can be NEXUS or Newick format.
+See \code{\link{read_tree}} for more details.
+Also, if using a recent release of the GreenGenes database tree,
+try the \code{\link{read_tree_greengenes}} function --
+this should solve some issues specific to importing that tree.
+If provided, the tree should have the same OTUs/tip-labels
+as the OTUs in the other files.
+Any taxa or samples missing in one of the files are removed from all.
+As an example from the QIIME pipeline,
+this tree would be a tree of the representative 16S rRNA sequences from each OTU
+cluster, with the number of leaves/tips equal to the number of taxa/species/OTUs,
+or the complete reference database tree that contains the OTU identifiers
+of every OTU in your abundance table.
+Note that this argument can be a tree object (\code{\link[ape]{phylo}}-class)
+for cases where the tree has been --- or needs to be --- imported separately,
+as in the case of the GreenGenes tree mentioned earlier (\code{\link{read_tree_greengenes}}).}
+
+\item{refseqfilename}{(Optional). Default \code{NULL}.
+The file path of the biological sequence file that contains at a minimum
+a sequence for each OTU in the dataset.
+Alternatively, you may provide an already-imported
+\code{\link[Biostrings]{XStringSet}} object that satisfies this condition.
+In either case, the \code{\link{names}} of each OTU need to match exactly the
+\code{\link{taxa_names}} of the other components of your data.
+If this is not the case, for example if the data file is a FASTA format but
+contains additional information after the OTU name in each sequence header,
+then some additional parsing is necessary,
+which you can either perform separately before calling this function,
+or describe explicitly in a custom function provided in the (next) argument,
+\code{refseqFunction}.
+Note that the \code{\link[Biostrings]{XStringSet}} class can represent any
+arbitrary sequence, including user-defined subclasses, but is most-often
+used to represent RNA, DNA, or amino acid sequences.
+The only constraint is that this special list of sequences
+has exactly one named element for each OTU in the dataset.}
+
+\item{refseqFunction}{(Optional).
+Default is \code{\link[Biostrings]{readDNAStringSet}},
+which expects to read a fasta-formatted DNA sequence file.
+If your reference sequences for each OTU are amino acid, RNA, or something else,
+then you will need to specify a different function here.
+This is the function used to read the file connection provided as
+the previous argument, \code{refseqfilename}.
+This argument is ignored if \code{refseqfilename} is already a
+\code{\link[Biostrings]{XStringSet}} class.}
+
+\item{refseqArgs}{(Optional).
+Default \code{NULL}.
+Additional arguments to \code{refseqFunction}.
+See \code{\link[Biostrings]{XStringSet-io}} for details about
+additional arguments to the standard read functions in the Biostrings package.}
+
+\item{parseFunction}{(Optional). An optional custom function for parsing the
+character string that contains the taxonomic assignment of each OTU.
+The default parsing function is \code{\link{parse_taxonomy_qiime}},
+specialized for splitting the \code{";"}-delimited strings and also
+attempting to interpret greengenes prefixes, if any, as that is a common
+format of the taxonomy string produced by QIIME.}
+
+\item{verbose}{(Optional). A \code{\link{logical}}.
+Default is \code{TRUE}.
+Should progress messages
+be \code{\link{cat}}ted to standard out?}
+
+\item{...}{Additional arguments passed to \code{\link{read_tree}}}
+}
+\value{
+A \code{\link{phyloseq-class}} object.
+}
+\description{
+QIIME produces several files that can be directly imported by
+the \code{\link{phyloseq-package}}.
+Originally, QIIME produced its own custom format table
+that contained both OTU-abundance
+and taxonomic identity information.
+This function is still included in phyloseq mainly to accommodate these
+now-outdated files. Recent versions of QIIME store output in the
+biom-format, an emerging file format standard for microbiome data.
+If your data is in the biom-format, or if its file name ends with a \code{.biom}
+extension, then you should use the \code{\link{import_biom}}
+function instead.
+}
+\details{
+Other related files include
+the mapping-file that typically stores sample covariates,
+converted naturally to the
+\code{\link{sample_data-class}} component data type in the phyloseq-package.
+QIIME may also produce a
+phylogenetic tree with a tip for each OTU, which can also be
+specified here or imported separately using \code{\link{read_tree}}.
+
+See \url{http://www.qiime.org/} for details on using QIIME. While there are
+many complex dependencies, QIIME can be downloaded as a pre-installed
+linux virtual machine that runs ``off the shelf''.
+
+The different files useful for import to \emph{phyloseq} are not collocated in
+a typical run of the QIIME pipeline. See the main \emph{phyloseq} vignette for an
+example of where to find the relevant files in the output directory.
+}
+\examples{
+ otufile <- system.file("extdata", "GP_otu_table_rand_short.txt.gz", package="phyloseq")
+ mapfile <- system.file("extdata", "master_map.txt", package="phyloseq")
+ trefile <- system.file("extdata", "GP_tree_rand_short.newick.gz", package="phyloseq")
+ import_qiime(otufile, mapfile, trefile)
+}
+\references{
+\url{http://qiime.org/}
+
+``QIIME allows analysis of high-throughput community sequencing data.''
+J Gregory Caporaso, Justin Kuczynski, Jesse Stombaugh, Kyle Bittinger, Frederic D Bushman,
+Elizabeth K Costello, Noah Fierer, Antonio Gonzalez Pena, Julia K Goodrich, Jeffrey I Gordon,
+Gavin A Huttley, Scott T Kelley, Dan Knights, Jeremy E Koenig, Ruth E Ley,
+Catherine A Lozupone, Daniel McDonald, Brian D Muegge, Meg Pirrung, Jens Reeder, Joel R Sevinsky,
+Peter J Turnbaugh, William A Walters, Jeremy Widmann, Tanya Yatsunenko, Jesse Zaneveld and Rob Knight;
+Nature Methods, 2010; doi:10.1038/nmeth.f.303
+}
+\seealso{
+\code{\link{phyloseq}}
+
+\code{\link{merge_phyloseq}}
+
+\code{\link{read_tree}}
+
+\code{\link{read_tree_greengenes}}
+
+\code{\link[Biostrings]{XStringSet-io}}
+}
+
diff --git a/man/import_qiime_otu_tax.Rd b/man/import_qiime_otu_tax.Rd
new file mode 100644
index 0000000..3491b29
--- /dev/null
+++ b/man/import_qiime_otu_tax.Rd
@@ -0,0 +1,83 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/IO-methods.R
+\name{import_qiime_otu_tax}
+\alias{import_qiime_otu_tax}
+\title{Import now legacy-format QIIME OTU table as a list of two matrices.}
+\usage{
+import_qiime_otu_tax(file, parseFunction = parse_taxonomy_qiime,
+ verbose = TRUE, parallel = FALSE)
+}
+\arguments{
+\item{file}{(Required). The path to the qiime-formatted file you want to
+import into R. Can be compressed (e.g. \code{.gz}, etc.), though the
+details may be OS-specific. That is, Windows-beware.}
+
+\item{parseFunction}{(Optional). An optional custom function for parsing the
+character string that contains the taxonomic assignment of each OTU.
+The default parsing function is \code{\link{parse_taxonomy_qiime}},
+specialized for splitting the \code{";"}-delimited strings and also
+attempting to interpret greengenes prefixes, if any, as that is a common
+format of the taxonomy string produced by QIIME.}
+
+\item{verbose}{(Optional). A \code{\link{logical}}.
+Default is \code{TRUE}.
+Should progress messages
+be \code{\link{cat}}ted to standard out?}
+
+\item{parallel}{(Optional). Logical. Should the parsing be performed in
+parallel? Default is \code{FALSE}. Only a few steps are actually
+parallelized, and for most datasets it will actually be faster and
+more efficient to keep this set to \code{FALSE}.
+Also, to get any benefit at all, you will need to register a
+parallel ``backend'' through one of the backend packages supported
+by the \code{\link{foreach-package}}.}
+}
+\value{
+A list of two matrices. \code{$otutab} contains the OTU Table
+ as a numeric matrix, while \code{$taxtab} contains a character matrix
+ of the taxonomy assignments.
+}
+\description{
+Now a legacy-format, older versions of QIIME
+produced an OTU file that typically contains both OTU-abundance
+and taxonomic identity information in a tab-delimited table.
+If your file ends with the extension \code{.biom}, or if you happen to know
+that it is a biom-format file, or if you used default settings in a version
+of QIIME of \code{1.7} or greater,
+then YOU SHOULD USE THE BIOM-IMPORT FUNCTION instead,
+\code{\link{import_biom}}.
+}
+\details{
+This function uses chunking to perform both the reading and parsing in blocks
+of optional size,
+thus constraining the peak memory usage.
+This feature should make this
+importer accessible to machines with modest memory,
+but with the caveat that
+the full numeric matrix must be a manageable size at the end, too.
+In principle, the final tables will be large, but much more efficiently represented than
+the character-stored numbers.
+If total memory for storing the numeric matrix becomes problematic,
+a switch to a sparse matrix representation of the abundance
+-- which is typically well-suited to this data -- might provide a solution.
+}
+\examples{
+ otufile <- system.file("extdata", "GP_otu_table_rand_short.txt.gz", package="phyloseq")
+ import_qiime_otu_tax(otufile)
+}
+\seealso{
+\code{\link{import}}
+
+\code{\link{merge_phyloseq}}
+
+\code{\link{phyloseq}}
+
+\code{\link{import_qiime}}
+
+\code{\link{read_tree}}
+
+\code{\link{read_tree_greengenes}}
+
+\code{\link{import_env_file}}
+}
+
diff --git a/man/import_qiime_sample_data.Rd b/man/import_qiime_sample_data.Rd
new file mode 100644
index 0000000..e953efb
--- /dev/null
+++ b/man/import_qiime_sample_data.Rd
@@ -0,0 +1,49 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/IO-methods.R
+\name{import_qiime_sample_data}
+\alias{import_qiime_sample_data}
+\title{Import just \code{sample_data} file from QIIME pipeline.}
+\usage{
+import_qiime_sample_data(mapfilename)
+}
+\arguments{
+\item{mapfilename}{(Required). A character string or connection.
+That is, any suitable \code{file} argument to the \code{\link{read.table}} function.
+The name of the QIIME map
+file required for processing pyrosequencing tags
+in QIIME as well as some of the post-clustering analysis. This is a required
+input file for running QIIME. Its strict formatting specification is expected by
+this function, do not attempt to modify it manually once it has worked properly
+in QIIME.}
+}
+\value{
+A \code{sample_data} object.
+}
+\description{
+QIIME produces several files that can be analyzed in the phyloseq-package.
+This includes the map-file, which is an important \emph{input}
+to QIIME that can also indicate sample covariates. It is converted naturally to the
+sample_data component data type in phyloseq-package, based on the R data.frame.
+}
+\details{
+See \code{\link{import_qiime}} for more information about QIIME. It is also the
+suggested function for importing QIIME-produced data files.
+}
+\examples{
+ mapfile <- system.file("extdata", "master_map.txt", package = "phyloseq")
+ import_qiime_sample_data(mapfile)
+}
+\seealso{
+\code{\link{import}}
+
+\code{\link{merge_phyloseq}}
+
+\code{\link{phyloseq}}
+
+\code{\link{import_qiime}}
+
+\code{\link{import_qiime_otu_tax}}
+
+\code{\link{import_env_file}}
+}
+
diff --git a/man/import_uparse.Rd b/man/import_uparse.Rd
new file mode 100644
index 0000000..e6500c1
--- /dev/null
+++ b/man/import_uparse.Rd
@@ -0,0 +1,69 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/IO-methods.R
+\name{import_uparse}
+\alias{import_uparse}
+\title{Import \href{http://www.drive5.com/usearch/manual/opt_uparseout.html}{UPARSE file format}}
+\usage{
+import_uparse(upFile, omitChimeras = TRUE, countTable = TRUE,
+ OTUtable = TRUE, verbose = TRUE)
+}
+\arguments{
+\item{upFile}{(Required). A file location character string
+or \code{\link{connection}}
+corresponding to the file that contains the UPARSE output table.
+This is passed directly to \code{\link[data.table]{fread}}.
+Please see its \code{file} argument documentation for further
+links and details.}
+
+\item{omitChimeras}{(Optional). \code{logical(1)}.
+Default is \code{TRUE}.
+Whether to omit entries that correspond to sequences/OTUs
+that were identified as chimeras.}
+
+\item{countTable}{(Optional). \code{logical(1)}.
+Default is \code{TRUE}.
+Whether to return the result as a wide-format table
+with dimensions OTU-by-sample,
+or to leave the table in its original sparse long-format
+that might be more suitable for certain \code{\link{data.table}} operations.
+If \code{TRUE}, entries corresponding to the same sample and OTU
+have their counts summed.}
+
+\item{OTUtable}{(Optional). \code{logical(1)}.
+Default is \code{TRUE}.
+Whether to coerce the result to \code{\link{otu_table}} format,
+or leave it as a \code{\link{data.table}} format.
+The former is appropriate for most \code{\link{phyloseq}} operations,
+the latter is useful for a lot of custom operations
+and custom \code{\link[ggplot2]{ggplot}2} graphics calls.}
+
+\item{verbose}{(Optional). A \code{\link{logical}}.
+Default is \code{TRUE}.
+Should progress messages
+be \code{\link{cat}}ted to standard out?}
+}
+\description{
+UPARSE is an algorithm for OTU-clustering implemented within usearch.
+At last check, the UPARSE algorithm was accessed via the
+\code{-cluster_otu} option flag.
+For details about installing and running usearch, please refer to the
+\href{http://drive5.com/usearch/}{usearch website}.
+For details about the output format, please refer to the
+\href{http://www.drive5.com/usearch/manual/opt_uparseout.html}{uparse format definition}.
+}
+\details{
+Because UPARSE is an external (non-R) application, there is no direct
+way to continuously check that these suggested arguments and file formats will
+remain in their current state.
+If there is a problem, please verify your version of usearch,
+create a small reproducible example of the problem,
+and post it as an issue on the
+\href{https://github.com/joey711/phyloseq/issues}{phyloseq issues tracker}.
+}
+\examples{
+###
+}
+\seealso{
+\code{\link{import_usearch_uc}}
+}
+
diff --git a/man/import_usearch_uc.Rd b/man/import_usearch_uc.Rd
new file mode 100644
index 0000000..2d21352
--- /dev/null
+++ b/man/import_usearch_uc.Rd
@@ -0,0 +1,89 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/IO-methods.R
+\name{import_usearch_uc}
+\alias{import_usearch_uc}
+\title{Import usearch table format (\code{.uc}) to OTU table}
+\usage{
+import_usearch_uc(ucfile, colRead = 9, colOTU = 10, readDelimiter = "_",
+ verbose = TRUE)
+}
+\arguments{
+\item{ucfile}{(Required). A file location character string
+or \code{\link{connection}}
+corresponding to the file that contains the usearch output table.
+This is passed directly to \code{\link{read.table}}.
+Please see its \code{file} argument documentation for further
+links and details.}
+
+\item{colRead}{(Optional). Numeric. The column index in the uc-table
+file that holds the read IDs.
+The default column index is \code{9}.}
+
+\item{colOTU}{(Optional). Numeric. The column index in the uc-table
+file that holds OTU IDs.
+The default column index is \code{10}.}
+
+\item{readDelimiter}{(Optional). An R \code{\link{regex}} as a character string.
+This should be the delimiter that separates the sample ID
+from the original ID in the demultiplexed read ID of your sequence file.
+The default is plain underscore, which in this \code{\link{regex}} context
+is \code{"_"}.}
+
+\item{verbose}{(Optional). A \code{\link{logical}}.
+Default is \code{TRUE}.
+Should progress messages
+be \code{\link{cat}}ted to standard out?}
+}
+\description{
+UPARSE is an algorithm for OTU-clustering implemented within usearch.
+At last check, the UPARSE algorithm was accessed via the
+\code{-cluster_otu} option flag.
+For details about installing and running usearch, please refer to the
+\href{http://drive5.com/usearch/}{usearch website}.
+For details about the output format, please refer to the
+\href{http://www.drive5.com/usearch/manual/opt_uc.html}{uc format definition}.
+This importer is intended to read a particular table format output
+that is generated by usearch,
+its so-called ``cluster format'',
+a file format that is often given the \code{.uc} extension
+in usearch documentation.
+}
+\details{
+Because usearch is an external (non-R) application, there is no direct
+way to continuously check that these suggested arguments and file formats will
+remain in their current state.
+If there is a problem, please verify your version of usearch,
+create a small reproducible example of the problem,
+and post it as an issue on the phyloseq issues tracker.
+The version of usearch upon which this import function
+was created is \code{7.0.109}.
+Hopefully later versions of usearch maintain this function and format,
+but the phyloseq team has no way to guarantee this,
+and so any feedback about this will help maintain future functionality.
+For instance, it is currently
+assumed that the 9th and 10th columns of the \code{.uc} table
+hold the read-label and OTU ID, respectively;
+and it is also assumed that the delimiter between sample-name and read
+in the read-name entries is a single \code{"_"}.
+If this is not true, you may have to update these parameters,
+or even modify the current implementation of this function.
+
+Also note that there is now a UPARSE-specific output file format,
+\href{http://www.drive5.com/usearch/manual/opt_uparseout.html}{uparseout},
+and it might make more sense to create and import that file
+for use in phyloseq.
+If so, you'll want to import using the
+\code{\link{import_uparse}()} function.
+}
+\examples{
+usearchfile <- system.file("extdata", "usearch.uc", package="phyloseq")
+import_usearch_uc(usearchfile)
+}
+\seealso{
+\code{\link{import}}
+
+\code{\link{import_biom}}
+
+\code{\link{import_qiime}}
+}
+
diff --git a/man/index_reorder.Rd b/man/index_reorder.Rd
new file mode 100644
index 0000000..4f58e82
--- /dev/null
+++ b/man/index_reorder.Rd
@@ -0,0 +1,30 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/phyloseq-class.R
+\docType{methods}
+\name{index_reorder}
+\alias{index_reorder}
+\alias{index_reorder,phyloseq-method}
+\title{Force index order of phyloseq objects}
+\usage{
+index_reorder(ps, index_type)
+
+\S4method{index_reorder}{phyloseq}(ps, index_type = "both")
+}
+\arguments{
+\item{ps}{(Required). A \code{\link{phyloseq-class}} instance.}
+
+\item{index_type}{(Optional). A character string
+specifying the indices to properly order.
+Supported values are \code{c("both", "taxa", "samples")}.
+Default is \code{"both"}, meaning samples and taxa indices
+will be checked/re-ordered.}
+}
+\description{
+Force index order of phyloseq objects
+}
+\examples{
+## data("GlobalPatterns")
+## GP = index_reorder(GlobalPatterns)
+}
+\keyword{internal}
+
diff --git a/man/intersect_taxa.Rd b/man/intersect_taxa.Rd
new file mode 100644
index 0000000..109c4f6
--- /dev/null
+++ b/man/intersect_taxa.Rd
@@ -0,0 +1,33 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/phyloseq-class.R
+\name{intersect_taxa}
+\alias{intersect_taxa}
+\title{Returns the intersection of species and samples for the components of x}
+\usage{
+intersect_taxa(x)
+}
+\arguments{
+\item{x}{(Required). A \code{\link{phyloseq-class}} object
+that contains 2 or more components
+that in-turn describe species/taxa.}
+}
+\value{
+Returns a character vector of only those species that are present in
+ all species-describing components of \code{x}.
+}
+\description{
+This function is used internally as part of the infrastructure to ensure that
+component data types in a phyloseq-object have exactly the same taxa/species.
+It relies heavily on the \code{\link{Reduce}} function to determine the
+strictly common species.
+}
+\examples{
+#
+## data(GlobalPatterns)
+## head(intersect_taxa(GlobalPatterns), 10)
+}
+\seealso{
+\code{\link{Reduce}}, \code{\link{intersect}}
+}
+\keyword{internal}
+
diff --git a/man/make_network.Rd b/man/make_network.Rd
new file mode 100644
index 0000000..7a57b62
--- /dev/null
+++ b/man/make_network.Rd
@@ -0,0 +1,97 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/network-methods.R
+\name{make_network}
+\alias{make_network}
+\title{Make microbiome network (igraph)}
+\usage{
+make_network(physeq, type="samples", distance="jaccard", max.dist = 0.4,
+ keep.isolates=FALSE, ...)
+}
+\arguments{
+\item{physeq}{(Required). Default \code{NULL}.
+A \code{\link{phyloseq-class}} object,
+or \code{\link{otu_table-class}} object,
+on which \code{g} is based. \code{phyloseq-class} recommended.}
+
+\item{type}{(Optional). Default \code{"samples"}.
+ Whether the network should be samples or taxa/OTUs.
+ Supported arguments are \code{"samples"}, \code{"taxa"},
+ where \code{"taxa"} indicates using the OTUs/taxa indices,
+ whether they actually represent species or some other taxonomic rank.
+
+ NOTE: not all distance methods are supported if \code{"taxa"}
+ selected for type. For example, the UniFrac distance and DPCoA
+ cannot be calculated for taxa-wise distances, because they use
+ a taxa-wise tree as part of their calculation between samples, and
+ there is no transpose-equivalent for this tree.}
+
+\item{distance}{(Optional). Default \code{"jaccard"}.
+ Any supported argument to the \code{method} parameter of the
+ \code{\link{distance}} function is supported here.
+ Some distance methods, like \code{"unifrac"}, may take
+ a non-trivial amount of time to calculate, in which case
+ you probably want to calculate the distance matrix separately,
+ save, and then provide it as the argument to \code{distance} instead.
+ (See below for alternatives).
+
+ Alternatively, if you have already calculated the sample-wise distance
+ object, the resulting \code{dist}-class object
+ can be provided as \code{distance} instead (see examples).
+
+ A third alternative is to provide a function that takes
+ a sample-by-taxa matrix (typical vegan orientation)
+ and returns a sample-wise distance
+ matrix.}
+
+\item{max.dist}{(Optional). Default \code{0.4}.
+The maximum ecological distance (as defined by \code{distance})
+allowed between two samples to still consider them ``connected''
+by an edge in the graphical model.}
+
+\item{keep.isolates}{(Optional). Default \code{FALSE}. Logical.
+Whether to keep isolates (un-connected samples, not microbial isolates)
+in the graphical model that is returned. Default results in isolates
+being removed from the object.}
+
+\item{...}{(Optional). Additional parameters passed on to \code{\link{distance}}.}
+}
+\value{
+An \code{igraph}-class object.
+}
+\description{
+A specialized function for creating a network representation of microbiomes,
+sample-wise or taxa-wise,
+based on a user-defined ecological distance and (potentially arbitrary) threshold.
+The graph is ultimately represented using the
+\code{igraph}-package.
+}
+\examples{
+# # Example plots with Enterotype Dataset
+data(enterotype)
+ig <- make_network(enterotype, max.dist=0.3)
+plot_network(ig, enterotype, color="SeqTech", shape="Enterotype", line_weight=0.3, label=NULL)
+#
+ig1 <- make_network(enterotype, max.dist=0.2)
+plot_network(ig1, enterotype, color="SeqTech", shape="Enterotype", line_weight=0.3, label=NULL)
+#
+# # Three methods of choosing/providing distance/distance-method
+# Provide method name available to distance() function
+ig <- make_network(enterotype, max.dist=0.3, distance="jaccard")
+# Provide distance object, already computed
+jaccdist <- distance(enterotype, "jaccard")
+ih <- make_network(enterotype, max.dist=0.3, distance=jaccdist)
+# Provide "custom" function.
+ii <- make_network(enterotype, max.dist=0.3, distance=function(x){vegan::vegdist(x, "jaccard")})
+# They have equal results:
+all.equal(ig, ih)
+all.equal(ig, ii)
+#
+# Try out making a trivial "network" of the 3-sample esophagus data,
+# with weighted-UniFrac as distance
+data(esophagus)
+ij <- make_network(esophagus, "samples", "unifrac", weighted=TRUE)
+}
+\seealso{
+\code{\link{plot_network}}
+}
+
diff --git a/man/merge_phyloseq.Rd b/man/merge_phyloseq.Rd
new file mode 100644
index 0000000..fbaf57b
--- /dev/null
+++ b/man/merge_phyloseq.Rd
@@ -0,0 +1,67 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/merge-methods.R
+\name{merge_phyloseq}
+\alias{merge_phyloseq}
+\title{Merge arguments into one phyloseq object.}
+\usage{
+merge_phyloseq(...)
+}
+\arguments{
+\item{...}{a comma-separated list of phyloseq objects.}
+}
+\value{
+Merges are performed by first separating higher-order objects into
+a list of their component objects; then, merging any component objects of the same class
+into one object according to the behavior described in \code{\link{merge_phyloseq_pair}};
+and finally, re-building a merged-object according to the constructor
+behavior of the \code{\link{phyloseq}} method. If the arguments contain only a single
+component type -- several otu_table objects, for example -- then a single merged object
+of the relevant component type is returned.
+
+Merges between 2 or more tree objects are ultimately done using
+\code{\link[ape]{consensus}} from the ape package.
+This has the potential to limit somewhat the final data object, because trees
+don't merge with other trees in the same granular manner as data tables, and
+ultimately the species/taxa in higher-order phyloseq objects will be clipped to
+what is contained in the tree. If this is an issue, the tree component should
+be omitted from the argument list.
+}
+\description{
+Takes a comma-separated list of phyloseq objects as arguments,
+and returns the most-comprehensive single phyloseq object possible.
+}
+\details{
+Higher-order objects can be created if arguments are appropriate component data
+types of different
+classes, and this should mirror the behavior of the \code{\link{phyloseq}} method,
+which is the suggested method if the goal is simply to create a higher-order
+phyloseq object from different data types (1 of each class) describing the same experiment.
+
+By contrast, this method is intended for situations in which one wants to combine
+multiple higher-order objects, or multiple core component data objects (e.g. more than one
+\code{otu_table}) that should be combined into one object.
+
+Merges are performed by first separating higher-order objects into
+a list of their component objects; then, merging any component objects of the same class
+into one object according to the behavior described in \code{\link{merge_phyloseq_pair}};
+and finally, building back up a merged-object according to the constructor
+behavior of the \code{\link{phyloseq}} method. If the arguments contain only a single
+component type -- several otu_table objects, for example -- then a single merged object
+of that component type is returned.
+}
+\examples{
+#
+## # Make a random complex object
+## OTU1 <- otu_table(matrix(sample(0:5,250,TRUE),25,10), taxa_are_rows=TRUE)
+## tax1 <- tax_table(matrix("abc", 30, 8))
+## map1 <- data.frame( matrix(sample(0:3,250,TRUE),25,10),
+## matrix(sample(c("a","b","c"),150,TRUE), 25, 6) )
+## map1 <- sample_data(map1)
+## exam1 <- phyloseq(OTU1, map1, tax1)
+## x <- exam1
+## x <- phyloseq(exam1)
+## y <- tax_table(exam1)
+## merge_phyloseq(x, y)
+## merge_phyloseq(y, y, y, y)
+}
+
diff --git a/man/merge_phyloseq_pair-methods.Rd b/man/merge_phyloseq_pair-methods.Rd
new file mode 100644
index 0000000..22384da
--- /dev/null
+++ b/man/merge_phyloseq_pair-methods.Rd
@@ -0,0 +1,79 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/merge-methods.R
+\docType{methods}
+\name{merge_phyloseq_pair}
+\alias{merge_phyloseq_pair}
+\alias{merge_phyloseq_pair,XStringSet,XStringSet-method}
+\alias{merge_phyloseq_pair,otu_table,otu_table-method}
+\alias{merge_phyloseq_pair,phylo,phylo-method}
+\alias{merge_phyloseq_pair,sample_data,sample_data-method}
+\alias{merge_phyloseq_pair,taxonomyTable,taxonomyTable-method}
+\title{Merge pair of phyloseq component data objects of the same class.}
+\usage{
+merge_phyloseq_pair(x, y)
+
+\S4method{merge_phyloseq_pair}{otu_table,otu_table}(x, y)
+
+\S4method{merge_phyloseq_pair}{taxonomyTable,taxonomyTable}(x, y)
+
+\S4method{merge_phyloseq_pair}{sample_data,sample_data}(x, y)
+
+\S4method{merge_phyloseq_pair}{phylo,phylo}(x, y)
+
+\S4method{merge_phyloseq_pair}{XStringSet,XStringSet}(x, y)
+}
+\arguments{
+\item{x}{A phyloseq component data object to merge.
+As the method signatures under usage show, supported classes are
+\code{otu_table}, \code{taxonomyTable}, \code{sample_data},
+\code{phylo}, and \code{XStringSet}. Where \code{x} and \code{y} hold
+redundant information, the values in \code{x} are used by default.}
+
+\item{y}{A phyloseq component data object of the same class as \code{x}.}
+}
+\value{
+A single component data object that matches \code{x} and \code{y}
+arguments. The returned object will
+contain the union of the species and/or samples of each. If there is redundant
+information between a pair of arguments of the same class, the values in \code{x} are
+used by default. Abundance values are summed for \code{otu_table} objects
+for those elements that describe the same species and sample in \code{x}
+and \code{y}.
+}
+\description{
+Internal S4 methods to combine pairs of objects of classes specified in the
+phyloseq package. These objects must be component data of the same type
+(class). This is mainly an internal method, provided to illustrate how
+merging is performed by the more general \code{\link{merge_phyloseq}} function.
+}
+\details{
+The \code{\link{merge_phyloseq}} function is recommended in general.
+
+Special note: non-identical trees are merged using \code{\link[ape]{consensus}}.
+}
+\examples{
+#
+## # merge two simulated otu_table objects.
+## x <- otu_table(matrix(sample(0:5,200,TRUE),20,10), taxa_are_rows=TRUE)
+## y <- otu_table(matrix(sample(0:5,300,TRUE),30,10), taxa_are_rows=FALSE)
+## xy <- merge_phyloseq_pair(x, y)
+## yx <- merge_phyloseq_pair(y, x)
+## # merge two simulated tax_table objects
+## x <- tax_table(matrix("abc", 20, 6))
+## y <- tax_table(matrix("def", 30, 8))
+## xy <- merge_phyloseq_pair(x, y)
+## # merge two simulated sample_data objects
+## x <- data.frame( matrix(sample(0:3,250,TRUE),25,10),
+## matrix(sample(c("a","b","c"),150,TRUE),25,6) )
+## x <- sample_data(x)
+## y <- data.frame( matrix(sample(4:6,200,TRUE),20,10),
+## matrix(sample(c("d","e","f"),120,TRUE),20,8) )
+## y <- sample_data(y)
+## merge_phyloseq_pair(x, y)
+## data.frame(merge_phyloseq_pair(x, y))
+## data.frame(merge_phyloseq_pair(y, x))
+}
+\seealso{
+\code{\link{merge_phyloseq}} \code{\link{merge_taxa}}
+}
+
diff --git a/man/merge_samples-methods.Rd b/man/merge_samples-methods.Rd
new file mode 100644
index 0000000..34720f8
--- /dev/null
+++ b/man/merge_samples-methods.Rd
@@ -0,0 +1,73 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/merge-methods.R
+\docType{methods}
+\name{merge_samples}
+\alias{merge_samples}
+\alias{merge_samples,otu_table-method}
+\alias{merge_samples,phyloseq-method}
+\alias{merge_samples,sample_data-method}
+\title{Merge samples based on a sample variable or factor.}
+\usage{
+merge_samples(x, group, fun=mean)
+
+\S4method{merge_samples}{sample_data}(x, group, fun = mean)
+
+\S4method{merge_samples}{otu_table}(x, group)
+
+\S4method{merge_samples}{phyloseq}(x, group, fun = mean)
+}
+\arguments{
+\item{x}{(Required). An instance of a phyloseq class that has sample indices. This includes
+\code{\link{sample_data-class}}, \code{\link{otu_table-class}}, and \code{\link{phyloseq-class}}.}
+
+\item{group}{(Required). Either a single character string matching a variable name in
+the corresponding sample_data of \code{x}, or a factor with the same length as
+the number of samples in \code{x}.}
+
+\item{fun}{(Optional). The function that will be used to merge the values that
+correspond to the same group for each variable. It must take a numeric vector
+as first argument and return a single value. Default is \code{\link[base]{mean}}.
+Note that this is (currently) ignored for the otu_table, where the equivalent
+function is \code{\link[base]{sum}}, but evaluated via \code{\link[base]{rowsum}}
+for efficiency.}
+}
+\value{
+A phyloseq object that has had its sample indices merged according to
+ the factor indicated by the \code{group} argument. The output class
+ matches \code{x}.
+}
+\description{
+The purpose of this method is to merge/agglomerate the sample indices of a
+phyloseq object according to a categorical variable contained in a sample_data
+or a provided factor.
+}
+\details{
+NOTE: (\code{\link[ape]{phylo}}) trees and \code{\link{taxonomyTable-class}}
+are not modified by this function, but returned in the output object as-is.
+}
+\examples{
+#
+data(GlobalPatterns)
+GP = GlobalPatterns
+mergedGP = merge_samples(GlobalPatterns, "SampleType")
+SD = merge_samples(sample_data(GlobalPatterns), "SampleType")
+print(SD)
+print(mergedGP)
+sample_names(GlobalPatterns)
+sample_names(mergedGP)
+identical(SD, sample_data(mergedGP))
+# The OTU abundances of merged samples are summed
+# Let's investigate this ourselves looking at just the top 10 most abundant OTUs...
+OTUnames10 = names(sort(taxa_sums(GP), TRUE)[1:10])
+GP10 = prune_taxa(OTUnames10, GP)
+mGP10 = prune_taxa(OTUnames10, mergedGP)
+ocean_samples = sample_names(subset(sample_data(GP), SampleType=="Ocean"))
+print(ocean_samples)
+otu_table(GP10)[, ocean_samples]
+rowSums(otu_table(GP10)[, ocean_samples])
+otu_table(mGP10)["Ocean", ]
+}
+\seealso{
+\code{\link{merge_taxa}}, \code{\link{merge_phyloseq}}
+}
+
diff --git a/man/merge_taxa-methods.Rd b/man/merge_taxa-methods.Rd
new file mode 100644
index 0000000..f3bfca4
--- /dev/null
+++ b/man/merge_taxa-methods.Rd
@@ -0,0 +1,78 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/merge-methods.R
+\docType{methods}
+\name{merge_taxa}
+\alias{merge_taxa}
+\alias{merge_taxa,XStringSet-method}
+\alias{merge_taxa,otu_table-method}
+\alias{merge_taxa,phylo-method}
+\alias{merge_taxa,phyloseq-method}
+\alias{merge_taxa,sample_data-method}
+\alias{merge_taxa,taxonomyTable-method}
+\title{Merge a subset of the species in \code{x} into one species/taxa/OTU.}
+\usage{
+merge_taxa(x, eqtaxa, archetype=1)
+
+\S4method{merge_taxa}{phyloseq}(x, eqtaxa,
+ archetype = eqtaxa[which.max(taxa_sums(x)[eqtaxa])])
+
+\S4method{merge_taxa}{sample_data}(x, eqtaxa, archetype = 1L)
+
+\S4method{merge_taxa}{otu_table}(x, eqtaxa,
+ archetype = eqtaxa[which.max(taxa_sums(x)[eqtaxa])])
+
+\S4method{merge_taxa}{phylo}(x, eqtaxa, archetype = 1L)
+
+\S4method{merge_taxa}{XStringSet}(x, eqtaxa, archetype = 1L)
+
+\S4method{merge_taxa}{taxonomyTable}(x, eqtaxa, archetype = 1L)
+}
+\arguments{
+\item{x}{(Required). An object that describes species (taxa). This includes
+\code{\link{phyloseq-class}}, \code{\link{otu_table-class}}, \code{\link{taxonomyTable-class}},
+\code{\link[ape]{phylo}}.}
+
+\item{eqtaxa}{(Required). The species names, or indices, that should be merged together.
+If \code{length(eqtaxa) < 2}, then the object \code{x} will be returned
+safely unchanged.}
+
+\item{archetype}{(Optional). A single-length numeric or character.
+The index of \code{eqtaxa}, or OTU ID,
+indicating the species that should be kept to represent
+the summed/merged group of species/taxa/OTUs.
+The default is to use the OTU with the largest count total
+if counts are available, or to use \code{1}
+(the first OTU in \code{eqtaxa}) otherwise.
+If \code{archetype} is not a valid index or index-name in \code{eqtaxa},
+the first will be used, and the value in archetype will be used
+as the index-name for the new species.}
+}
+\value{
+The object, \code{x}, in its original class, but with the specified
+ species merged into one entry in all relevant components.
+}
+\description{
+Takes as input an object that describes species/taxa
+(e.g. \code{\link{phyloseq-class}}, \code{\link{otu_table-class}},
+ \code{\link{phylo-class}}, \code{\link{taxonomyTable-class}}),
+as well as
+a vector of species that should be merged.
+It is intended to be able to operate at a low-level such that
+related methods, such as \code{\link{tip_glom}} and \code{\link{tax_glom}}
+can both reliably call \code{merge_taxa} for their respective purposes.
+}
+\examples{
+#
+data(esophagus)
+tree <- phy_tree(esophagus)
+otu <- otu_table(esophagus)
+otutree0 <- phyloseq(otu, tree)
+# plot_tree(otutree0)
+otutree1 <- merge_taxa(otutree0, 1:8, 2)
+# plot_tree(esophagus, ladderize="left")
+}
+\seealso{
+\code{\link{tip_glom}}, \code{\link{tax_glom}}, \code{\link{merge_phyloseq}},
+ \code{\link{merge_samples}}
+}
+
diff --git a/man/metaMDS.Rd b/man/metaMDS.Rd
new file mode 100644
index 0000000..b9d970d
--- /dev/null
+++ b/man/metaMDS.Rd
@@ -0,0 +1,27 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/allClasses.R
+\docType{data}
+\name{metaMDS}
+\alias{metaMDS}
+\title{S3 class placeholder definition (list) for metaMDS}
+\format{An object of class \code{metaMDS} of length 0.}
+\usage{
+metaMDS
+}
+\description{
+The vegan package does not export a version of its \code{\link[vegan]{metaMDS}}-class,
+partly because it is not really defined formally anywhere.
+Instead, it is an S3 class extended from the base class, \code{\link{list}} --
+this is a very common and easy approach --
+and proper behavior of any method taking an instance of this class
+requires exact naming conventions for element names of the list components.
+The phyloseq package does not provide any validity checks that a given metaMDS
+instance is valid (conforms to the conventions in the vegan package)... yet.
+If problems arise, this might be considered, and they could be defined
+judiciously and within phyloseq.
+}
+\seealso{
+\code{\link[vegan]{metaMDS}}
+}
+\keyword{internal}
+
diff --git a/man/microbio_me_qiime.Rd b/man/microbio_me_qiime.Rd
new file mode 100644
index 0000000..e991d9f
--- /dev/null
+++ b/man/microbio_me_qiime.Rd
@@ -0,0 +1,99 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/IO-methods.R
+\name{microbio_me_qiime}
+\alias{microbio_me_qiime}
+\title{Import microbio.me/qiime (QIIME-DB) data package}
+\usage{
+microbio_me_qiime(zipftp, ext = ".zip", parsef = parse_taxonomy_greengenes,
+ ...)
+}
+\arguments{
+\item{zipftp}{(Required). A character string that is the full URL
+path to a zipped file that follows the file naming conventions used by
+\href{http://www.microbio.me/qiime/index.psp}{microbio.me/qiime}.
+Alternatively, you can simply provide the study number
+as a single \code{\link{integer}} or other single-length vector
+that can be \code{\link{coerce}}d to an integer;
+this function will complete the remainder of the ftp URL hosted at
+\href{http://www.microbio.me/qiime/index.psp}{microbio.me/qiime}.
+For example, instead of the full URL string,
+\code{"ftp://thebeast.colorado.edu/pub/QIIME_DB_Public_Studies/study_494_split_library_seqs_and_mapping.zip"},
+you could simply provide \code{494} or \code{"494"}
+as the first (\code{zipftp}) argument.}
+
+\item{ext}{(Optional). A \code{\link{character}} string of the expected
+file extension, which also indicates the compression type,
+if \code{zipftp} is a study number instead of the full path.
+Note that this argument has no effect if \code{zipftp} is the full path,
+in which case the file extension is read directly from the downloaded file.}
+
+\item{parsef}{(Optional). The type of taxonomic parsing to use for the
+OTU taxonomic classification, in the \code{.biom} file, if present.
+This is passed on to \code{\link{import_biom}}, but unlike that function
+the default parsing function is \code{\link{parse_taxonomy_greengenes}},
+rather than \code{\link{parse_taxonomy_default}}, because we know
+ahead of time that most (or all?) of the taxonomic classifications
+in the \code{microbio.me/qiime} repository will be based on
+GreenGenes.}
+
+\item{...}{(Optional, for advanced users). Additional arguments passed to
+\code{\link{download.file}}. This is mainly for non-standard links to
+resources (in this case, a zipped file) that are not being hosted by
+\href{http://www.microbio.me/qiime/index.psp}{microbio.me/qiime}.
+If you are using a FTP address or study number from their servers,
+then you shouldn't need to provide any additional arguments.}
+}
+\value{
+A \code{\link{phyloseq-class}} object if possible, a component if only a
+ component could be imported, or \code{NULL} if nothing could be imported
+ after unzipping the file. Keep in mind there is a specific naming-convention
+ that is expected based on the current state of the
+ \href{http://www.microbio.me/qiime/index.psp}{microbio.me/qiime}
+ servers. Several helpful messages are \code{\link{cat}}ted to standard out
+ to help let you know the ongoing status of the current
+ download and import process.
+}
+\description{
+Originally, this function was for accessing microbiome datasets from the
+\href{http://www.microbio.me/qiime/index.psp}{microbio.me/qiime}
+public repository from within R.
+As you can see by clicking on the above link,
+the QIIME-DB server is down indefinitely.
+However, this function will remain supported here
+in case the FTP server goes back up,
+and also for phyloseq users that have downloaded
+one or more data packages prior to the server going down.
+}
+\examples{
+# This should return TRUE on your system if you have internet turned on
+# and a standard R installation. Indicates whether this is likely to
+# work on your system for a URL or local file, respectively.
+capabilities("http/ftp"); capabilities("fifo")
+# A working example with a local example file included in phyloseq
+zipfile = "study_816_split_library_seqs_and_mapping.zip"
+zipfile = system.file("extdata", zipfile, package="phyloseq")
+tarfile = "study_816_split_library_seqs_and_mapping.tar.gz"
+tarfile = system.file("extdata", tarfile, package="phyloseq")
+tarps = microbio_me_qiime(tarfile)
+zipps = microbio_me_qiime(zipfile)
+identical(tarps, zipps)
+tarps; zipps
+plot_heatmap(tarps)
+# An example that used to work, before the QIIME-DB server was turned off by its host.
+# # Smokers dataset
+# smokezip = "ftp://thebeast.colorado.edu/pub/QIIME_DB_Public_Studies/study_524_split_library_seqs_and_mapping.zip"
+# smokers1 = microbio_me_qiime(smokezip)
+# # Alternatively, just use the study number
+# smokers2 = microbio_me_qiime(524)
+# identical(smokers1, smokers2)
+}
+\seealso{
+See \code{\link{download.file}} and \code{\link{url}}
+ for details about URL formats --
+ including local file addresses -- that might work here.
+
+ \code{\link{import_biom}}
+
+ \code{\link{import_qiime}}
+}
+
diff --git a/man/mt-methods.Rd b/man/mt-methods.Rd
new file mode 100644
index 0000000..36f0981
--- /dev/null
+++ b/man/mt-methods.Rd
@@ -0,0 +1,93 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/multtest-wrapper.R
+\docType{methods}
+\name{mt}
+\alias{mt}
+\alias{mt,otu_table,character-method}
+\alias{mt,otu_table,factor-method}
+\alias{mt,otu_table,integer-method}
+\alias{mt,otu_table,logical-method}
+\alias{mt,otu_table,numeric-method}
+\alias{mt,phyloseq,ANY-method}
+\title{Multiple testing of taxa abundance according to sample categories/classes}
+\usage{
+mt(physeq, classlabel, minPmaxT = "minP", method = "fdr", ...)
+
+\S4method{mt}{phyloseq,ANY}(physeq, classlabel, minPmaxT = "minP",
+ method = "fdr", ...)
+
+\S4method{mt}{otu_table,integer}(physeq, classlabel, minPmaxT = "minP",
+ method = "fdr", ...)
+
+\S4method{mt}{otu_table,numeric}(physeq, classlabel, minPmaxT = "minP",
+ method = "fdr", ...)
+
+\S4method{mt}{otu_table,logical}(physeq, classlabel, minPmaxT = "minP",
+ method = "fdr", ...)
+
+\S4method{mt}{otu_table,character}(physeq, classlabel, minPmaxT = "minP",
+ method = "fdr", ...)
+
+\S4method{mt}{otu_table,factor}(physeq, classlabel, minPmaxT = "minP",
+ method = "fdr", ...)
+}
+\arguments{
+\item{physeq}{(Required). \code{\link{otu_table-class}} or \code{\link{phyloseq-class}}.
+In this multiple testing framework, different taxa correspond to variables
+(hypotheses), and samples to observations.}
+
+\item{classlabel}{(Required). A single character index of the sample-variable
+in the \code{\link{sample_data}} of \code{physeq} that will be used for multiple testing.
+Alternatively, \code{classlabel} can be a custom integer (or numeric coercible
+to an integer), character, or factor with
+length equal to \code{nsamples(physeq)}.
+
+NOTE: the default test applied to each taxa is a two-sample two-sided
+\code{\link{t.test}}, WHICH WILL FAIL with an error if you provide a data variable
+(or custom vector) that contains MORE THAN TWO classes. One alternative to consider
+is an F-test, by specifying \code{test="f"} as an additional argument. See
+the first example below, and/or further documentation of
+\code{\link[multtest]{mt.maxT}} or \code{\link[multtest]{mt.minP}}
+for other options and formal details.}
+
+\item{minPmaxT}{(Optional). Character string. \code{"minP"} or \code{"maxT"}.
+Default is \code{"minP"}, which uses \code{\link[multtest]{mt.minP}}.}
+
+\item{method}{(Optional). Additional multiple-hypothesis correction methods.
+A character vector from the set \code{\link[stats]{p.adjust.methods}}.
+Default is \code{"fdr"}, for the Benjamini and Hochberg (1995) method
+to control False Discovery Rate (FDR). This argument is passed on to
+\code{\link[stats]{p.adjust}}, please see that documentation for more details.}
+
+\item{...}{(Optional). Additional arguments, forwarded to
+\code{\link[multtest]{mt.maxT}} or \code{\link[multtest]{mt.minP}}}
+}
+\value{
+A dataframe with components specified in the documentation for
+ \code{\link[multtest]{mt.maxT}} or \code{\link[multtest]{mt.minP}}, respectively.
+}
+\description{
+Please note that it is up to you to perform any necessary
+normalizing / standardizing transformations prior to these tests.
+See for instance \code{\link{transform_sample_counts}}.
+}
+\examples{
+## # Simple example, testing genera that significantly correlate with Enterotypes
+data(enterotype)
+# Filter samples that don't have Enterotype
+x <- subset_samples(enterotype, !is.na(Enterotype))
+# (the taxa are at the genera level in this dataset)
+res = mt(x, "Enterotype", method=c("fdr", "bonferroni"), test="f", B=300)
+head(res, 10)
+## # Not surprisingly, Prevotella and Bacteroides top the list.
+## # Different test, multiple-adjusted t-test, whether samples are ent-2 or not.
+## mt(x, get_variable(x, "Enterotype")==2)
+}
+\seealso{
+\code{\link[multtest]{mt.maxT}}
+
+\code{\link[multtest]{mt.minP}}
+
+\code{\link[stats]{p.adjust}}
+}
+
diff --git a/man/nodeplotblank.Rd b/man/nodeplotblank.Rd
new file mode 100644
index 0000000..f9e9bf4
--- /dev/null
+++ b/man/nodeplotblank.Rd
@@ -0,0 +1,40 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/plot-methods.R
+\name{nodeplotblank}
+\alias{nodeplotblank}
+\title{Function to avoid plotting node labels}
+\usage{
+nodeplotblank(p, nodelabdf)
+}
+\arguments{
+\item{p}{(Required). The \code{\link{plot_tree}} graphic.}
+
+\item{nodelabdf}{(Required). The \code{data.frame} produced internally in
+\code{\link{plot_tree}} to use as data for creating ggplot2-based tree graphics.}
+}
+\value{
+The same input object, \code{p}, provided as input. Unmodified.
+}
+\description{
+Unlike \code{\link{nodeplotdefault}} and \code{\link{nodeplotboot}},
+this function does not return a function, but instead is provided
+directly to the \code{nodelabf} argument of \code{\link{plot_tree}} to
+ensure that node labels are not added to the graphic.
+Please note that you do not need to create or obtain the arguments to
+this function. Instead, you can provide this function directly to
+\code{\link{plot_tree}} and it will know what to do with it. Namely,
+use it to avoid plotting any node labels.
+}
+\examples{
+data("esophagus")
+plot_tree(esophagus)
+plot_tree(esophagus, nodelabf=nodeplotblank)
+}
+\seealso{
+\code{\link{nodeplotdefault}}
+
+\code{\link{nodeplotboot}}
+
+\code{\link{plot_tree}}
+}
+
diff --git a/man/nodeplotboot.Rd b/man/nodeplotboot.Rd
new file mode 100644
index 0000000..ab66827
--- /dev/null
+++ b/man/nodeplotboot.Rd
@@ -0,0 +1,61 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/plot-methods.R
+\name{nodeplotboot}
+\alias{nodeplotboot}
+\title{Generates a function for labeling bootstrap values on a phylogenetic tree.}
+\usage{
+nodeplotboot(highthresh=95L, lowcthresh=50L, size=2L, hjust=-0.2)
+}
+\arguments{
+\item{highthresh}{(Optional). A single integer between 0 and 100.
+Any bootstrap values above this threshold will be annotated as
+a black filled circle on the node, rather than the bootstrap
+percentage value itself.}
+
+\item{lowcthresh}{(Optional). A single integer between 0 and 100,
+less than \code{highthresh}. Any bootstrap values below this value
+will not be added to the graphic. Set to 0 or below to add all
+available values.}
+
+\item{size}{(Optional). Numeric. Should be positive. The
+size parameter used to control the text size of taxa labels.
+Default is \code{2}. These are ggplot2 sizes.}
+
+\item{hjust}{(Optional). The horizontal justification of the
+node labels. Default is \code{-0.2}.}
+}
+\value{
+A function that can add a bootstrap-values layer to the tree graphic.
+ The values are represented in two ways; either as black filled circles
+ indicating very high-confidence nodes, or the bootstrap value itself
+ printed in small text next to the node on the tree.
+}
+\description{
+Is not a labeling function itself, but returns one.
+The returned function is specialized for labeling bootstrap values.
+Note that the function that
+is returned has two completely different arguments from the four listed here:
+the plot object already built by earlier steps in
+\code{\link{plot_tree}}, and the \code{\link{data.frame}}
+that contains the relevant plotting data for the nodes
+(especially \code{x, y, label}),
+respectively.
+See \code{\link{nodeplotdefault}} for a simpler example.
+The main purpose of this and \code{\link{nodeplotdefault}} is to
+provide a useful default function generator for arbitrary and
+bootstrap node labels, respectively, and also to act as
+examples of functions that can successfully interact with
+\code{\link{plot_tree}} to add node labels to the graphic.
+}
+\examples{
+nodeplotboot()
+nodeplotboot(3, -0.4)
+}
+\seealso{
+\code{\link{nodeplotdefault}}
+
+\code{\link{nodeplotblank}}
+
+\code{\link{plot_tree}}
+}
+
diff --git a/man/nodeplotdefault.Rd b/man/nodeplotdefault.Rd
new file mode 100644
index 0000000..536dfce
--- /dev/null
+++ b/man/nodeplotdefault.Rd
@@ -0,0 +1,48 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/plot-methods.R
+\name{nodeplotdefault}
+\alias{nodeplotdefault}
+\title{Generates a default node-label function}
+\usage{
+nodeplotdefault(size=2L, hjust=-0.2)
+}
+\arguments{
+\item{size}{(Optional). Numeric. Should be positive. The
+size parameter used to control the text size of taxa labels.
+Default is \code{2}. These are ggplot2 sizes.}
+
+\item{hjust}{(Optional). The horizontal justification of the
+node labels. Default is \code{-0.2}.}
+}
+\value{
+A function that can add a node-label layer to a graphic.
+}
+\description{
+Is not a labeling function itself, but returns one.
+The returned function is capable of adding
+whatever label is on a node. Note that the function that
+is returned has two completely different arguments to those listed here:
+the plot object already built by earlier steps in
+\code{\link{plot_tree}}, and the \code{\link{data.frame}}
+that contains the relevant plotting data for the nodes
+(especially \code{x, y, label}),
+respectively.
+See \code{\link{nodeplotboot}} for a more sophisticated example.
+The main purpose of this and \code{\link{nodeplotboot}} is to
+provide a useful default function generator for arbitrary and
+bootstrap node labels, respectively, and also to act as
+examples of functions that will successfully interact with
+\code{\link{plot_tree}} to add node labels to the graphic.
+}
+\examples{
+nodeplotdefault()
+nodeplotdefault(3, -0.4)
+}
+\seealso{
+\code{\link{nodeplotboot}}
+
+\code{\link{nodeplotblank}}
+
+\code{\link{plot_tree}}
+}
+
diff --git a/man/nsamples-methods.Rd b/man/nsamples-methods.Rd
new file mode 100644
index 0000000..248e674
--- /dev/null
+++ b/man/nsamples-methods.Rd
@@ -0,0 +1,45 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/almostAllAccessors.R
+\docType{methods}
+\name{nsamples}
+\alias{nsamples}
+\alias{nsamples,ANY-method}
+\alias{nsamples,otu_table-method}
+\alias{nsamples,phyloseq-method}
+\alias{nsamples,sample_data-method}
+\title{Get the number of samples.}
+\usage{
+nsamples(physeq)
+
+\S4method{nsamples}{ANY}(physeq)
+
+\S4method{nsamples}{phyloseq}(physeq)
+
+\S4method{nsamples}{otu_table}(physeq)
+
+\S4method{nsamples}{sample_data}(physeq)
+}
+\arguments{
+\item{physeq}{A \code{\link{phyloseq-class}}, \code{\link{sample_data}},
+or \code{\link{otu_table-class}}.}
+}
+\value{
+An integer indicating the total number of samples.
+}
+\description{
+Get the number of samples.
+}
+\examples{
+#
+data("esophagus")
+tree <- phy_tree(esophagus)
+OTU1 <- otu_table(esophagus)
+nsamples(OTU1)
+physeq1 <- phyloseq(OTU1, tree)
+nsamples(physeq1)
+}
+\seealso{
+\code{\link{taxa_names}}, \code{\link{sample_names}},
+ \code{\link{ntaxa}}
+}
+
diff --git a/man/ntaxa-methods.Rd b/man/ntaxa-methods.Rd
new file mode 100644
index 0000000..46345c8
--- /dev/null
+++ b/man/ntaxa-methods.Rd
@@ -0,0 +1,48 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/almostAllAccessors.R
+\docType{methods}
+\name{ntaxa}
+\alias{ntaxa}
+\alias{ntaxa,ANY-method}
+\alias{ntaxa,XStringSet-method}
+\alias{ntaxa,otu_table-method}
+\alias{ntaxa,phylo-method}
+\alias{ntaxa,phyloseq-method}
+\alias{ntaxa,taxonomyTable-method}
+\title{Get the number of taxa/species.}
+\usage{
+ntaxa(physeq)
+
+\S4method{ntaxa}{ANY}(physeq)
+
+\S4method{ntaxa}{phyloseq}(physeq)
+
+\S4method{ntaxa}{otu_table}(physeq)
+
+\S4method{ntaxa}{taxonomyTable}(physeq)
+
+\S4method{ntaxa}{phylo}(physeq)
+
+\S4method{ntaxa}{XStringSet}(physeq)
+}
+\arguments{
+\item{physeq}{\code{\link{phyloseq-class}}, \code{\link{otu_table-class}},
+\code{\link{taxonomyTable-class}}, or
+\code{\link[ape]{phylo}}}
+}
+\value{
+An integer indicating the number of taxa / species.
+}
+\description{
+Get the number of taxa/species.
+}
+\examples{
+data("esophagus")
+ntaxa(esophagus)
+phy_tree(esophagus)
+ntaxa(phy_tree(esophagus))
+}
+\seealso{
+\code{\link{taxa_names}}
+}
+
diff --git a/man/ordinate.Rd b/man/ordinate.Rd
new file mode 100644
index 0000000..7d5440e
--- /dev/null
+++ b/man/ordinate.Rd
@@ -0,0 +1,156 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/ordination-methods.R
+\name{ordinate}
+\alias{ordinate}
+\title{Perform an ordination on phyloseq data}
+\usage{
+ordinate(physeq, method = "DCA", distance = "bray", formula = NULL, ...)
+}
+\arguments{
+\item{physeq}{(Required). Phylogenetic sequencing data
+(\code{\link{phyloseq-class}}). The data on which you want to perform
+the ordination. In general, these methods will be based in some fashion on
+the abundance table ultimately stored as a contingency matrix
+(\code{\link{otu_table-class}}). If you're able to import data into
+\code{\link{phyloseq-class}} format, then you don't need to worry, as an
+\code{otu_table} is a required component of this class. In addition, some
+ordination methods require additional data, like a constraining variable
+or phylogenetic tree. If that is the case, the relevant data should be
+included in \code{physeq} prior to running. Integrating the data in this way
+also results in these different data components being checked for validity
+and completeness by the method.}
+
+\item{method}{(Optional). A character string. Default is \code{"DCA"}.
+
+ Currently supported method options are:
+\code{c("DCA", "CCA", "RDA", "CAP", "DPCoA", "NMDS", "MDS", "PCoA")}
+
+\describe{
+ \item{DCA}{Performs detrended correspondence analysis using \code{\link{decorana}}}
+ \item{CCA}{Performs correspondence analysis,
+ or optionally, constrained correspondence analysis
+ (a.k.a. canonical correspondence analysis),
+ via \code{\link[vegan]{cca}}}
+ \item{RDA}{Performs redundancy analysis, or optionally
+ principal components analysis, via \code{\link[vegan]{rda}}}
+ \item{CAP}{[Partial] Constrained Analysis of Principal Coordinates
+ or distance-based RDA, via \code{\link[vegan]{capscale}}.
+ See \code{\link[phyloseq]{capscale.phyloseq}} for more details.
+ In particular, a \code{\link{formula}} argument must be provided.}
+ \item{DPCoA}{Performs Double Principal Coordinate Analysis using a
+ (corrected, if necessary) phylogenetic/patristic distance
+ between species. The calculation is performed by
+ \code{\link{DPCoA}}(), which ultimately uses
+ \code{\link[ade4]{dpcoa}} after making the appropriate
+ accessions/corrections of the data.}
+ \item{NMDS}{Performs Non-metric MultiDimensional Scaling of a sample-wise
+ ecological distance matrix onto a user-specified number of axes, \code{k}.
+ By default, \code{k=2}, but this can be modified as a supplementary argument.
+ This method is ultimately carried out by \code{\link{metaMDS}} after the
+ appropriate accessions and distance calculations.
+ Because \code{metaMDS} includes its own distance
+ calculation wrappers to \code{\link[vegan]{vegdist}}, and these provide
+ additional functionality in the form of species scores,
+ \code{ordinate} will pass-on the \code{distance}
+ argument to \code{metaMDS} if it is among the
+ supported \code{vegdist} methods. However, all distance methods
+ supported by \code{\link{distance}} are supported here,
+ including \code{"unifrac"} and \code{"DPCoA"}.}
+ \item{MDS/PCoA}{Performs principal coordinate analysis
+ (also called principal coordinate decomposition,
+ multidimensional scaling (MDS), or classical scaling)
+ of a distance matrix (Gower 1966),
+ including two correction methods for negative eigenvalues.
+ See
+ \code{\link[ape]{pcoa}} for further details.
+ }
+}}
+
+\item{distance}{(Optional). A character string. Default is \code{"bray"}.
+ The name of a supported \code{\link{distance}} method;
+ or, alternatively,
+ a pre-computed \code{\link{dist}}-class object.
+ This argument is only utilized
+ if a distance matrix is required by the ordination method specified by the
+ \code{method} argument (above).
+
+ Any supported \code{\link{distance}} methods
+ are supported arguments to \code{distance} here.
+ See \code{\link{distance}} for more details, examples.}
+
+\item{formula}{(Optional). A model \code{\link{formula}}.
+Only relevant for certain ordination methods.
+The left hand side is ignored, defined by
+the \code{physeq} and \code{distance} arguments.
+The right hand side gives the constraining variables,
+and conditioning variables can be given
+within a special function \code{Condition}.
+See \code{\link[vegan]{cca}} or \code{\link[vegan]{capscale}}
+for examples/details.}
+
+\item{...}{(Optional). Additional arguments to supporting functions. For
+example, the additional argument \code{weighted=TRUE} would be passed on
+to \code{\link{UniFrac}} if \code{"unifrac"} were chosen as the
+\code{distance} option and \code{"MDS"} as the ordination \code{method}
+option. Alternatively, if \code{"DCA"} were chosen as the
+ordination \code{method} option, additional arguments would be passed on
+to the relevant ordination function, \code{\link{decorana}}, for example.}
+}
+\value{
+An ordination object. The specific class of the returned object depends upon the
+ ordination method, as well as the function/package that is called internally
+ to perform it.
+ As a general rule, any of the ordination classes
+ returned by this function will be recognized by downstream tools in the
+ \code{phyloseq} package, for example the ordination plotting
+ function, \code{\link{plot_ordination}}.
+}
+\description{
+This function wraps several commonly-used ordination methods. The type of
+ordination depends upon the argument to \code{method}. Try
+\code{ordinate("help")} or \code{ordinate("list")} for the currently
+supported method options.
+}
+\examples{
+# See http://joey711.github.io/phyloseq/plot_ordination-examples
+# for many more examples.
+# plot_ordination(GP, ordinate(GP, "DCA"), "samples", color="SampleType")
+}
+\seealso{
+\href{http://joey711.github.io/phyloseq/plot_ordination-examples}{The plot_ordination Tutorial}
+
+ Related component ordination functions described within phyloseq:
+
+ \code{\link{DPCoA}}
+
+ Described/provided by other packages:
+
+ \code{\link{cca}}/\code{\link{rda}}, \code{\link{decorana}}, \code{\link{metaMDS}},
+ \code{\link{pcoa}}, \code{\link[vegan]{capscale}}
+
+ NMDS and MDS/PCoA both operate on distance matrices, typically based on some
+ pairwise comparison of the microbiomes in an experiment/project. There are
+ a number of common methods to use to calculate these pairwise distances, and
+ the most convenient function (from a \code{phyloseq} point of view) for calculating
+ these distance matrices is the
+
+ \code{\link{distance}}
+
+ function. It can be
+ thought of as a distance / dissimilarity-index companion function for
+ \code{ordinate}, and indeed the distance options provided to \code{ordinate}
+ are often simply passed on to \code{\link{distance}}.
+
+ A good quick summary of ordination is provided in the introductory vignette
+ for vegan:
+
+ \href{http://cran.r-project.org/web/packages/vegan/vignettes/intro-vegan.pdf}{vegan introductory vignette}
+
+ The following \code{R} task views are also useful for understanding the
+ available tools in \code{R}:
+
+\href{http://cran.r-project.org/web/views/Environmetrics.html}{Analysis of Ecological and Environmental Data}
+
+\href{http://cran.r-project.org/web/views/Multivariate.html}{Multivariate Statistics}
+}
+
diff --git a/man/otu_table-class.Rd b/man/otu_table-class.Rd
new file mode 100644
index 0000000..f5aaae2
--- /dev/null
+++ b/man/otu_table-class.Rd
@@ -0,0 +1,24 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/allClasses.R
+\docType{class}
+\name{otu_table-class}
+\alias{otu_table-class}
+\title{The S4 class for storing taxa-abundance information.}
+\description{
+Because orientation of these tables can vary by method, the orientation is
+defined explicitly in the \code{taxa_are_rows} slot (a logical).
+The \code{otu_table} class inherits the \code{\link{matrix}} class to store
+abundance values.
+Various standard subset and assignment nomenclature has been extended to apply
+to the \code{otu_table} class, including square-bracket, \code{\link{t}}, etc.
+}
+\details{
+\describe{
+ \item{taxa_are_rows}{
+ A single logical specifying the orientation of the abundance table.
+ }
+
+\item{.Data}{This slot is inherited from the \code{\link{matrix}} class.}
+ }
+}
+
diff --git a/man/otu_table-methods.Rd b/man/otu_table-methods.Rd
new file mode 100644
index 0000000..56d221f
--- /dev/null
+++ b/man/otu_table-methods.Rd
@@ -0,0 +1,59 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/otuTable-class.R
+\docType{methods}
+\name{otu_table}
+\alias{otu_table}
+\alias{otu_table,ANY-method}
+\alias{otu_table,data.frame-method}
+\alias{otu_table,matrix-method}
+\alias{otu_table,otu_table-method}
+\alias{otu_table,phyloseq-method}
+\title{Build or access the otu_table.}
+\usage{
+otu_table(object, taxa_are_rows, errorIfNULL=TRUE)
+
+\S4method{otu_table}{phyloseq}(object, errorIfNULL = TRUE)
+
+\S4method{otu_table}{otu_table}(object, errorIfNULL = TRUE)
+
+\S4method{otu_table}{matrix}(object, taxa_are_rows)
+
+\S4method{otu_table}{data.frame}(object, taxa_are_rows)
+
+\S4method{otu_table}{ANY}(object, errorIfNULL = TRUE)
+}
+\arguments{
+\item{object}{(Required). An integer matrix, \code{\link{otu_table-class}},
+or \code{\link{phyloseq-class}}.}
+
+\item{taxa_are_rows}{(Conditionally optional). Logical; of length 1. Ignored
+unless \code{object} is a matrix, in which case it is required.}
+
+\item{errorIfNULL}{(Optional). Logical. Should the accessor stop with
+an error if the slot is empty (\code{NULL})? Default \code{TRUE}. Ignored
+if \code{object} argument is a matrix (constructor invoked instead).}
+}
+\value{
+An \code{\link{otu_table-class}} object.
+}
+\description{
+This is the suggested method for both constructing and accessing
+Operational Taxonomic Unit (OTU) abundance (\code{\link{otu_table-class}}) objects.
+When the first
+argument is a matrix, otu_table() will attempt to create and return an
+otu_table-class object,
+which further depends on whether or not \code{taxa_are_rows} is provided as an
+additional argument.
+Alternatively, if the first argument is an experiment-level (\code{\link{phyloseq-class}})
+object, then the corresponding \code{otu_table} is returned.
+}
+\examples{
+#
+# data(GlobalPatterns)
+# otu_table(GlobalPatterns)
+}
+\seealso{
+\code{\link{phy_tree}}, \code{\link{sample_data}}, \code{\link{tax_table}},
+ \code{\link{phyloseq}}, \code{\link{merge_phyloseq}}
+}
+
diff --git a/man/parseTaxonomy-functions.Rd b/man/parseTaxonomy-functions.Rd
new file mode 100644
index 0000000..905e7f3
--- /dev/null
+++ b/man/parseTaxonomy-functions.Rd
@@ -0,0 +1,71 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/IO-methods.R
+\name{parse_taxonomy_default}
+\alias{parse_taxonomy_default}
+\alias{parse_taxonomy_greengenes}
+\alias{parse_taxonomy_qiime}
+\title{Parse elements of a taxonomy vector}
+\usage{
+parse_taxonomy_default(char.vec)
+
+parse_taxonomy_greengenes(char.vec)
+
+parse_taxonomy_qiime(char.vec)
+}
+\arguments{
+\item{char.vec}{(Required). A single character vector of taxonomic
+ranks for a single OTU, unprocessed (ugly).}
+}
+\value{
+A character vector in which each element is a different
+ taxonomic rank of the same OTU, and each element name is the name of
+ the rank level. For example, an element might be \code{"Firmicutes"}
+ and named \code{"phylum"}.
+ These parsed, named versions of the taxonomic vector should
+ reflect embedded information, naming conventions,
+ desired length limits, etc; or in the case of \code{\link{parse_taxonomy_default}},
+ not modified at all and given dummy rank names to each element.
+}
+\description{
+These are provided as both example and default functions for
+parsing a character vector of taxonomic rank information for a single taxon.
+As default functions, these are intended for cases where the data adheres to
+the naming convention used by greengenes
+(\url{http://greengenes.lbl.gov/cgi-bin/nph-index.cgi})
+or where the convention is unknown, respectively.
+To work, these functions -- and any similar custom function you may want to
+create and use -- must take as input a single character vector of taxonomic
+ranks for a single OTU, and return a \strong{named} character vector that has
+been modified appropriately (according to known naming conventions,
+desired length limits, etc.).
+The length (number of elements) of the output named vector does \strong{not}
+need to be equal to the input, which is useful for the cases where the
+source data files have extra meaningless elements that should probably be
+removed, like the ubiquitous
+``Root'' element often found in greengenes/QIIME taxonomy labels.
+In the case of \code{parse_taxonomy_default}, no naming convention is assumed and
+so dummy rank names are added to the vector.
+More usefully if your taxonomy data is based on greengenes, the
+\code{parse_taxonomy_greengenes} function clips the first 3 characters that
+identify the rank, and uses these to name the corresponding element according
+to the appropriate taxonomic rank name used by greengenes
+(e.g. \code{"p__"} at the beginning of an element means that element is
+the name of the phylum to which this OTU belongs).
+Most importantly, the expectations for these functions described above
+make them compatible to use during data import,
+specifically the \code{\link{import_biom}} function, but
+it is a flexible structure that will be implemented soon for all phyloseq
+import functions that deal with taxonomy (e.g. \code{\link{import_qiime}}).
+}
+\examples{
+ taxvec1 = c("Root", "k__Bacteria", "p__Firmicutes", "c__Bacilli", "o__Bacillales", "f__Staphylococcaceae")
+ parse_taxonomy_default(taxvec1)
+ parse_taxonomy_greengenes(taxvec1)
+ taxvec2 = c("Root;k__Bacteria;p__Firmicutes;c__Bacilli;o__Bacillales;f__Staphylococcaceae")
+ parse_taxonomy_qiime(taxvec2)
+}
+\seealso{
+\code{\link{import_biom}}
+ \code{\link{import_qiime}}
+}
+
diff --git a/man/pcoa.Rd b/man/pcoa.Rd
new file mode 100644
index 0000000..98ae3be
--- /dev/null
+++ b/man/pcoa.Rd
@@ -0,0 +1,20 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/allClasses.R
+\docType{data}
+\name{pcoa}
+\alias{pcoa}
+\title{S3 class for ape-calculated MDS results}
+\format{An object of class \code{pcoa} of length 0.}
+\usage{
+pcoa
+}
+\description{
+Nothing to import, because ape doesn't (yet) export this S3 class.
+We will define it here, but keep it internal.
+For the moment, its only use is for proper dispatch in our extensions
+to the scores S3 generic from vegan,
+for generic extraction of coordinates and possibly other features from
+any ordination results.
+}
+\keyword{internal}
+
diff --git a/man/phy_tree-methods.Rd b/man/phy_tree-methods.Rd
new file mode 100644
index 0000000..2b0fc63
--- /dev/null
+++ b/man/phy_tree-methods.Rd
@@ -0,0 +1,58 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/almostAllAccessors.R
+\docType{methods}
+\name{phy_tree}
+\alias{phy_tree}
+\alias{phy_tree,ANY-method}
+\alias{phy_tree,phylo-method}
+\title{Retrieve phylogenetic tree (\code{\link[ape]{phylo}}-class) from object.}
+\usage{
+phy_tree(physeq, errorIfNULL=TRUE)
+
+\S4method{phy_tree}{ANY}(physeq, errorIfNULL = TRUE)
+
+\S4method{phy_tree}{phylo}(physeq)
+}
+\arguments{
+\item{physeq}{(Required). An instance of phyloseq-class
+that contains a phylogenetic tree. If physeq is a phylogenetic
+tree (a component data class), then it is returned as-is.}
+
+\item{errorIfNULL}{(Optional). Logical. Should the accessor stop with
+an error if the slot is empty (\code{NULL})? Default \code{TRUE}.}
+}
+\value{
+The \code{\link[ape]{phylo}}-class object contained within \code{physeq};
+ or NULL if \code{physeq} does not have a tree.
+ This method stops with an error in the latter NULL case by default, which
+ can be over-ridden by changing the value of \code{errorIfNULL} to \code{FALSE}.
+}
+\description{
+This is the suggested method
+for accessing
+the phylogenetic tree, (\code{\link[ape]{phylo}}-class) from a \code{\link{phyloseq-class}}.
+Like other accessors (see See Also, below), the default behavior of this method
+is to stop with an
+error if \code{physeq} is a \code{phyloseq-class} but does not
+contain a phylogenetic tree (the component data you are trying to access in this case).
+}
+\details{
+Note that the tip labels should be named to match the
+\code{taxa_names} of the other objects to which it is going to be paired.
+The \code{\link{phyloseq}} constructor automatically checks for
+exact agreement in the set of species described by the phylogenetic tree
+and the other components (taxonomyTable, otu_table),
+and trims as-needed. Thus, the tip.labels in a phylo object
+must be named to match the results of
+\code{\link{taxa_names}} of the other objects to which it will ultimately be paired.
+}
+\examples{
+ data(GlobalPatterns)
+ phy_tree(GlobalPatterns)
+}
+\seealso{
+\code{\link{otu_table}}, \code{\link{sample_data}}, \code{\link{tax_table}},
+ \code{\link{refseq}},
+ \code{\link{phyloseq}}, \code{\link{merge_phyloseq}}
+}
+
diff --git a/man/phylo-class.Rd b/man/phylo-class.Rd
new file mode 100644
index 0000000..ae12713
--- /dev/null
+++ b/man/phylo-class.Rd
@@ -0,0 +1,14 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/allClasses.R
+\name{phylo-class}
+\alias{phylo-class}
+\title{An S4 placeholder of the main phylogenetic tree class from the ape package.}
+\description{
+See the \code{\link[ape]{ape}} package for details about this type of
+representation of a phylogenetic tree.
+It is used throughout the ape package.
+}
+\seealso{
+\code{\link[ape]{phylo}}, \code{\link{setOldClass}}
+}
+
diff --git a/man/phylo.Rd b/man/phylo.Rd
new file mode 100644
index 0000000..aa428ed
--- /dev/null
+++ b/man/phylo.Rd
@@ -0,0 +1,36 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/allClasses.R
+\docType{data}
+\name{phylo}
+\alias{phylo}
+\title{S3 class placeholder definition (list) for phylogenetic trees.}
+\format{An object of class \code{phylo} of length 0.}
+\usage{
+phylo
+}
+\description{
+The ape package does not export a version of its \code{\link[ape]{phylo}}-class,
+partly because it is not really defined formally anywhere.
+Instead, it is an S3 class extended from the base class, \code{\link{list}} --
+this is a very common and easy approach --
+and proper behavior of any method taking an instance of this class
+requires exact naming conventions for element names of the components.
+The phyloseq package does not provide any validity checks that a given phylo
+instance is valid (conforms to the conventions in the ape package). Yet.
+If problems arise, this might be considered, and they could be defined
+judiciously and within phyloseq.
+Similarly, if a formal definition for the phylo-class is ever exported
+by ape, the current philosophy of phyloseq would be to remove this
+internal definition and import the former. Note that there is still some
+work going on for the phylobase package, which is addressing these same
+exact issues for S4 phylogenetic tree interaction.
+A very large number of packages (around 60 at my last count), depend on ape,
+making it easily the de facto standard for representing phylogenetic trees in R;
+and the phyloseq team would prefer to use any exported definitions from
+the ape package if possible and available.
+}
+\seealso{
+\code{\link[ape]{phylo}}
+}
+\keyword{internal}
+
diff --git a/man/phyloseq-class.Rd b/man/phyloseq-class.Rd
new file mode 100644
index 0000000..1df0e61
--- /dev/null
+++ b/man/phyloseq-class.Rd
@@ -0,0 +1,51 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/allClasses.R
+\docType{class}
+\name{phyloseq-class}
+\alias{phyloseq-class}
+\title{The main experiment-level class for phyloseq data}
+\description{
+Contains all currently-supported component data classes:
+\code{\link{otu_table-class}},
+\code{\link{sample_data-class}},
+\code{\link{taxonomyTable-class}} (\code{"tax_table"} slot),
+\code{\link[ape]{phylo}}-class (\code{"phy_tree"} slot),
+and the \code{\link[Biostrings]{XStringSet-class}} (\code{"refseq"} slot).
+There are several advantages
+to storing your phylogenetic sequencing experiment as an instance of the
+phyloseq class, not the least of which is that it is easy to return to the
+data later and feel confident that the different data types ``belong'' to
+one another. Furthermore, the \code{\link{phyloseq}} constructor ensures that
+the different data components have compatible indices (e.g. OTUs and samples),
+and performs the necessary trimming automatically when you create your
+``experiment-level'' object. Downstream analyses are aware of which data
+classes they require -- and where to find them -- often making your
+\code{phyloseq-class} object the only data argument required for analysis and plotting
+functions (although there are many options and parameter arguments available
+to you).
+}
+\details{
+In the case of missing component data, the slots are set to \code{NULL}. As
+soon as a \code{phyloseq-class} object is to be updated with new component
+data (previously missing/\code{NULL} or not), the indices of all components
+are re-checked for compatibility and trimmed if necessary. This is to ensure
+by design that components describe the same taxa/samples, and also that these
+trimming/validity checks do not need to be repeated in downstream analyses.
+
+slots:
+\describe{
+ \item{otu_table}{a single object of class otu_table.}
+ \item{sam_data}{ a single object of class sample_data.}
+ \item{tax_table}{ a single object of class taxonomyTable.}
+ \item{phy_tree}{ a single object of the \code{\link[ape]{phylo}}-class, from the ape package.}
+ \item{refseq}{ a biological sequence set object of a class that
+ inherits from the \code{\link[Biostrings]{XStringSet-class}}, from the Biostrings package.}
+}
+}
+\seealso{
+The constructor, \code{\link{phyloseq}},
+ the merger \code{\link{merge_phyloseq}}, and also the component
+ constructor/accessors \code{\link{otu_table}}, \code{\link{sample_data}},
+ \code{\link{tax_table}}, \code{\link{phy_tree}}, and \code{\link{refseq}}.
+}
+
diff --git a/man/phyloseq-deprecated.Rd b/man/phyloseq-deprecated.Rd
new file mode 100644
index 0000000..2030349
--- /dev/null
+++ b/man/phyloseq-deprecated.Rd
@@ -0,0 +1,111 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/deprecated_functions.R
+\docType{package}
+\name{phyloseq-deprecated}
+\alias{deprecated_phyloseq_function}
+\alias{filterfunSample}
+\alias{genefilterSample}
+\alias{getSamples}
+\alias{getSpecies}
+\alias{getTaxa}
+\alias{getVariable}
+\alias{import_qiime_sampleData}
+\alias{merge_species}
+\alias{nspecies}
+\alias{otuTable}
+\alias{otuTable<-}
+\alias{phyloseq-deprecated}
+\alias{phyloseq-deprecated-package}
+\alias{plot_richness_estimates}
+\alias{plot_taxa_bar}
+\alias{prune_species}
+\alias{rank.names}
+\alias{samData}
+\alias{sam_data}
+\alias{sam_data<-}
+\alias{sample.names}
+\alias{sample.variables}
+\alias{sampleData}
+\alias{sampleData<-}
+\alias{sampleNames}
+\alias{sampleSums}
+\alias{show_mothur_list_cutoffs}
+\alias{species.names}
+\alias{speciesAreRows}
+\alias{speciesAreRows<-}
+\alias{speciesSums}
+\alias{speciesarerows}
+\alias{subset_species}
+\alias{taxTab}
+\alias{taxTab<-}
+\alias{taxaplot}
+\alias{taxglom}
+\alias{taxtab}
+\alias{tipglom}
+\alias{tre}
+\alias{tre<-}
+\title{Deprecated functions in the phyloseq package.}
+\usage{
+deprecated_phyloseq_function(x, value, ...)
+}
+\arguments{
+\item{x}{For assignment operators, the object that will undergo a replacement
+(object inside parenthesis).}
+
+\item{value}{For assignment operators, the value to replace with
+(the right side of the assignment).}
+
+\item{...}{For functions other than assignment operators,
+parameters to be passed to the modern version of the function (see table).}
+}
+\description{
+These will be migrated to \code{"defunct"} status in the next release,
+and removed completely in the release after that.
+These functions are provided for compatibility with older versions of
+the phyloseq package. They may eventually be completely
+removed.
+}
+\details{
+\tabular{rl}{
+ \code{plot_taxa_bar} \tab now a synonym for \code{\link{plot_bar}}\cr
+ \code{taxaplot} \tab now a synonym for \code{\link{plot_bar}}\cr
+ \code{taxtab} \tab now a synonym for \code{\link{tax_table}}\cr
+ \code{taxTab} \tab now a synonym for \code{\link{tax_table}}\cr
+ \code{sampleData} \tab now a synonym for \code{\link{sample_data}}\cr
+ \code{samData} \tab now a synonym for \code{\link{sample_data}}\cr
+ \code{sam_data} \tab now a synonym for \code{\link{sample_data}}\cr
+ \code{speciesSums} \tab now a synonym for \code{\link{taxa_sums}}\cr
+ \code{sampleSums} \tab now a synonym for \code{\link{sample_sums}}\cr
+ \code{nspecies} \tab now a synonym for \code{\link{ntaxa}}\cr
+ \code{species.names} \tab now a synonym for \code{\link{taxa_names}}\cr
+ \code{sampleNames} \tab now a synonym for \code{\link{sample_names}}\cr
+ \code{sample.names} \tab now a synonym for \code{\link{sample_names}}\cr
+ \code{getSamples} \tab now a synonym for \code{\link{get_sample}}\cr
+ \code{getSpecies} \tab now a synonym for \code{\link{get_taxa}}\cr
+ \code{rank.names} \tab now a synonym for \code{\link{rank_names}}\cr
+ \code{getTaxa} \tab now a synonym for \code{\link{get_taxa_unique}}\cr
+ \code{sample.variables} \tab now a synonym for \code{\link{sample_variables}}\cr
+ \code{getVariable} \tab now a synonym for \code{\link{get_variable}}\cr
+ \code{merge_species} \tab now a synonym for \code{\link{merge_taxa}}\cr
+ \code{otuTable} \tab now a synonym for \code{\link{otu_table}}\cr
+ \code{speciesarerows} \tab now a synonym for \code{\link{taxa_are_rows}}\cr
+ \code{speciesAreRows} \tab now a synonym for \code{\link{taxa_are_rows}}\cr
+ \code{plot_richness_estimates} \tab now a synonym for \code{\link{plot_richness}}\cr
+ \code{import_qiime_sampleData} \tab now a synonym for \code{\link{import_qiime_sample_data}}\cr
+ \code{filterfunSample} \tab now a synonym for \code{\link{filterfun_sample}}\cr
+ \code{genefilterSample} \tab now a synonym for \code{\link{genefilter_sample}}\cr
+ \code{prune_species} \tab now a synonym for \code{\link{prune_taxa}}\cr
+ \code{subset_species} \tab now a synonym for \code{\link{subset_taxa}}\cr
+ \code{tipglom} \tab now a synonym for \code{\link{tip_glom}}\cr
+ \code{taxglom} \tab now a synonym for \code{\link{tax_glom}}\cr
+ \code{tre} \tab now a synonym for \code{\link{phy_tree}}\cr
+ \code{show_mothur_list_cutoffs} \tab now a synonym for \code{\link{show_mothur_cutoffs}}\cr
+ \code{sam_data<-} \tab now a synonym for \code{\link{sample_data<-}}\cr
+ \code{sampleData<-} \tab now a synonym for \code{\link{sample_data<-}}\cr
+ \code{tre<-} \tab now a synonym for \code{\link{phy_tree<-}}\cr
+ \code{speciesAreRows<-} \tab now a synonym for \code{\link{taxa_are_rows<-}}\cr
+ \code{otuTable<-} \tab now a synonym for \code{\link{otu_table<-}}\cr
+ \code{taxTab<-} \tab now a synonym for \code{\link{tax_table<-}}\cr
+}
+}
+
diff --git a/man/phyloseq-package.Rd b/man/phyloseq-package.Rd
new file mode 100644
index 0000000..58de586
--- /dev/null
+++ b/man/phyloseq-package.Rd
@@ -0,0 +1,28 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/allPackage.R
+\docType{package}
+\name{phyloseq-package}
+\alias{phyloseq-package}
+\title{Handling and analysis of high-throughput phylogenetic sequence data.}
+\description{
+There are already several ecology and phylogenetic packages available in R,
+including the adephylo, vegan, ade4, picante, ape, phangorn, phylobase, and OTUbase packages.
+These can already take advantage of many of the powerful statistical and graphics tools
+available in R. However, prior to \emph{phyloseq} a user must devise their own methods
+for parsing the output of their favorite OTU clustering application, and, as a consequence,
+there is also no standard within Bioconductor (or R generally) for storing or sharing the
+suite of related data objects that describe a phylogenetic sequencing project.
+The phyloseq package seeks to address these issues by providing a related set of S4 classes
+that internally manage the handling tasks associated with organizing, linking, storing,
+and analyzing phylogenetic sequencing data. \emph{phyloseq} additionally provides some
+convenience wrappers for input from common clustering applications, common analysis pipelines,
+and native implementation of methods that are not available in other R packages.
+}
+\author{
+Paul J. McMurdie II \email{mcmurdie at stanford.edu}
+}
+\references{
+\url{www.stanford.edu/~mcmurdie}
+}
+\keyword{package}
+
diff --git a/man/phyloseq.Rd b/man/phyloseq.Rd
new file mode 100644
index 0000000..154f3c7
--- /dev/null
+++ b/man/phyloseq.Rd
@@ -0,0 +1,49 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/phyloseq-class.R
+\name{phyloseq}
+\alias{phyloseq}
+\title{Build phyloseq-class objects from their components.}
+\usage{
+phyloseq(...)
+}
+\arguments{
+\item{...}{One or more component objects among the set of classes
+defined by the phyloseq package, as well as \code{phylo}-class
+(defined by the \code{\link{ape-package}}). Each argument should be a different class.
+For combining multiple components of the same class, or multiple phyloseq-class
+objects, use the \code{\link{merge_phyloseq}} function. Unlike in earlier
+versions, the arguments to phyloseq do not need to be named, and the order
+of the arguments does not matter.}
+}
+\value{
+The class of the returned object depends on the argument
+ class(es). For an experiment-level object, two or more component data objects
+ must be provided.
+ Otherwise, if a single component-class
+ is provided, it is simply returned as-is.
+ The order of arguments does not matter.
+}
+\description{
+\code{phyloseq()} is a constructor method. This is the main method
+suggested for constructing an experiment-level (\code{\link{phyloseq-class}})
+object from its component data
+(component data classes: \code{\link{otu_table-class}}, \code{\link{sample_data-class}},
+ \code{\link{taxonomyTable-class}}, \code{\link{phylo-class}}).
+}
+\examples{
+data(esophagus)
+x1 = phyloseq(otu_table(esophagus), phy_tree(esophagus))
+identical(x1, esophagus)
+# # data(GlobalPatterns)
+# # GP <- GlobalPatterns
+# # phyloseq(sample_data(GP), otu_table(GP))
+# # phyloseq(otu_table(GP), phy_tree(GP))
+# # phyloseq(tax_table(GP), otu_table(GP))
+# # phyloseq(phy_tree(GP), otu_table(GP), sample_data(GP))
+# # phyloseq(otu_table(GP), tax_table(GP), sample_data(GP))
+# # phyloseq(otu_table(GP), phy_tree(GP), tax_table(GP), sample_data(GP))
+}
+\seealso{
+\code{\link{merge_phyloseq}}
+}
+
diff --git a/man/phyloseq_to_deseq2.Rd b/man/phyloseq_to_deseq2.Rd
new file mode 100644
index 0000000..406697c
--- /dev/null
+++ b/man/phyloseq_to_deseq2.Rd
@@ -0,0 +1,62 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/extend_DESeq2.R
+\name{phyloseq_to_deseq2}
+\alias{phyloseq_to_deseq2}
+\title{Convert phyloseq data to DESeq2 dds object}
+\usage{
+phyloseq_to_deseq2(physeq, design, ...)
+}
+\arguments{
+\item{physeq}{(Required). \code{\link{phyloseq-class}}.
+Must have a \code{\link{sample_data}} component.}
+
+\item{design}{(Required). A \code{\link{formula}} which specifies the design of the experiment,
+taking the form \code{formula(~ x + y + z)}. That is, a formula with right-hand side only.
+By default, the functions in this package and DESeq2
+will use the last variable in the formula (e.g. \code{z})
+for presenting results (fold changes, etc.) and plotting.
+When considering your specification of experimental design, you will want to
+re-order the levels so that the \code{NULL} set is first.
+For example, the following line of code would ensure that Enterotype 1 is used as the
+reference sample class in tests by setting it to the first of the factor levels
+using the \code{\link{relevel}} function:
+
+\code{sample_data(entill)$Enterotype <- relevel(sample_data(entill)$Enterotype, "1")}}
+
+\item{...}{(Optional). Additional named arguments passed to \code{\link[DESeq2]{DESeqDataSetFromMatrix}}.
+Most users will not need to pass any additional arguments here.
+Most testing-related options should be provided in
+a following call to \code{\link[DESeq2]{DESeq}}.}
+}
+\value{
+A \code{\link[DESeq2]{DESeqDataSet}} object.
+}
+\description{
+No testing is performed by this function. The phyloseq data is converted
+to the relevant \code{\link[DESeq2]{DESeqDataSet}} object, which can then be
+tested in the negative binomial generalized linear model framework
+of the \code{\link[DESeq2]{DESeq}} function in DESeq2 package.
+See the
+\href{http://joey711.github.io/phyloseq-extensions}{phyloseq-extensions}
+tutorials for more details.
+}
+\examples{
+ # Check out the vignette phyloseq-mixture-models for more details.
+ # vignette("phyloseq-mixture-models")
+ data(soilrep)
+ phyloseq_to_deseq2(soilrep, ~warmed)
+}
+\seealso{
+\code{vignette("phyloseq-mixture-models")}
+
+The
+\href{http://joey711.github.io/phyloseq-extensions}{phyloseq-extensions}
+tutorials.
+
+ \code{\link[DESeq2]{DESeq}}
+
+ \code{\link[DESeq2]{results}}
+
+ \code{\link[DESeq2]{DESeqDataSetFromMatrix}}
+}
+
diff --git a/man/phyloseq_to_metagenomeSeq.Rd b/man/phyloseq_to_metagenomeSeq.Rd
new file mode 100644
index 0000000..a0f6c9d
--- /dev/null
+++ b/man/phyloseq_to_metagenomeSeq.Rd
@@ -0,0 +1,42 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/extend_metagenomeSeq.R
+\name{phyloseq_to_metagenomeSeq}
+\alias{phyloseq_to_metagenomeSeq}
+\title{Convert phyloseq data to MetagenomeSeq MRexperiment object}
+\usage{
+phyloseq_to_metagenomeSeq(physeq, ...)
+}
+\arguments{
+\item{physeq}{(Required). \code{\link{phyloseq-class}}.}
+
+\item{...}{(Optional). Additional named arguments passed
+to \code{\link[metagenomeSeq]{newMRexperiment}}.
+Most users will not need to pass any additional arguments here.}
+}
+\value{
+A \code{\link[metagenomeSeq]{MRexperiment-class}} object.
+}
+\description{
+No testing is performed by this function. The phyloseq data is converted
+to the relevant \code{\link[metagenomeSeq]{MRexperiment-class}} object, which can then be
+tested in the zero-inflated mixture model framework
+(e.g. \code{\link[metagenomeSeq]{fitZig}})
+in the metagenomeSeq package.
+See the
+\href{http://joey711.github.io/phyloseq-extensions}{phyloseq-extensions}
+tutorials for more details.
+}
+\examples{
+ # Check out the vignette metagenomeSeq for more details.
+ # vignette("metagenomeSeq")
+ data(soilrep)
+ phyloseq_to_metagenomeSeq(soilrep)
+}
+\seealso{
+\code{\link[metagenomeSeq]{fitTimeSeries}}
+ \code{\link[metagenomeSeq]{fitLogNormal}}
+ \code{\link[metagenomeSeq]{fitZig}}
+ \code{\link[metagenomeSeq]{MRtable}}
+ \code{\link[metagenomeSeq]{MRfulltable}}
+}
+
diff --git a/man/plot_bar.Rd b/man/plot_bar.Rd
new file mode 100644
index 0000000..30fb997
--- /dev/null
+++ b/man/plot_bar.Rd
@@ -0,0 +1,74 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/plot-methods.R
+\name{plot_bar}
+\alias{plot_bar}
+\title{A flexible, informative barplot of phyloseq data}
+\usage{
+plot_bar(physeq, x="Sample", y="Abundance", fill=NULL,
+ title=NULL, facet_grid=NULL)
+}
+\arguments{
+\item{physeq}{(Required). An \code{\link{otu_table-class}} or
+\code{\link{phyloseq-class}}.}
+
+\item{x}{(Optional). Optional, but recommended, especially if your data
+is comprised of many samples. A character string.
+The variable in the melted-data that should be mapped to the x-axis.
+See \code{\link{psmelt}}, \code{\link{melt}},
+and \code{\link{ggplot}} for more details.}
+
+\item{y}{(Optional). A character string.
+The variable in the melted-data that should be mapped to the y-axis.
+Typically this will be \code{"Abundance"}, in order to
+quantitatively display the abundance values for each OTU/group.
+However, alternative variables could be used instead,
+producing a very different, though possibly still informative, plot.
+See \code{\link{psmelt}}, \code{\link{melt}},
+and \code{\link{ggplot}} for more details.}
+
+\item{fill}{(Optional). A character string. Indicates which sample variable
+should be used to map to the fill color of the bars.
+The default is \code{NULL}, resulting in a gray fill for all bar segments.}
+
+\item{title}{(Optional). Default \code{NULL}. Character string.
+The main title for the graphic.}
+
+\item{facet_grid}{(Optional). A formula object.
+It should describe the faceting you want in exactly the same way as for
+\code{\link[ggplot2]{facet_grid}},
+and is ultimately provided to \code{\link{ggplot}}2 graphics.
+The default is: \code{NULL}, resulting in no faceting.}
+}
+\value{
+A \code{\link[ggplot2]{ggplot}}2 graphic object -- rendered in the graphical device
+ as the default \code{\link[base]{print}}/\code{\link[methods]{show}} method.
+}
+\description{
+There are many useful examples of phyloseq barplot graphics in the
+\href{http://joey711.github.io/phyloseq/plot_bar-examples}{phyloseq online tutorials}.
+This function wraps \code{ggplot2} plotting, and returns a \code{ggplot2}
+ graphic object
+that can be saved or further modified with additional layers, options, etc.
+The main purpose of this function is to quickly and easily create informative
+summary graphics of the differences in taxa abundance between samples in
+an experiment.
+}
+\examples{
+data("GlobalPatterns")
+gp.ch = subset_taxa(GlobalPatterns, Phylum == "Chlamydiae")
+plot_bar(gp.ch)
+plot_bar(gp.ch, fill="Genus")
+plot_bar(gp.ch, x="SampleType", fill="Genus")
+plot_bar(gp.ch, "SampleType", fill="Genus", facet_grid=~Family)
+# See additional examples in the plot_bar online tutorial. Link above.
+}
+\seealso{
+\href{http://joey711.github.io/phyloseq/plot_bar-examples}{phyloseq online tutorials}.
+
+ \code{\link{psmelt}}
+
+ \code{\link{ggplot}}
+
+ \code{\link{qplot}}
+}
+
diff --git a/man/plot_clusgap.Rd b/man/plot_clusgap.Rd
new file mode 100644
index 0000000..1fa618e
--- /dev/null
+++ b/man/plot_clusgap.Rd
@@ -0,0 +1,59 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/plot-methods.R
+\name{plot_clusgap}
+\alias{plot_clusgap}
+\title{Create a ggplot summary of gap statistic results}
+\usage{
+plot_clusgap(clusgap, title = "Gap Statistic results")
+}
+\arguments{
+\item{clusgap}{(Required).
+An object of S3 class \code{"clusGap"}, basically a list with components.
+See the \code{\link[cluster]{clusGap}} documentation for more details.
+In most cases this will be the output of \code{\link{gapstat_ord}},
+or \code{\link[cluster]{clusGap}} if you called it directly.}
+
+\item{title}{(Optional). Character string.
+The main title for the graphic.
+Default is \code{"Gap Statistic results"}.}
+}
+\value{
+A \code{\link[ggplot2]{ggplot}} plot object.
+The rendered graphic should be a plot of the gap statistic score
+versus values for \code{k}, the number of clusters.
+}
+\description{
+Create a ggplot summary of gap statistic results
+}
+\examples{
+# Load and process data
+data("soilrep")
+soilr = rarefy_even_depth(soilrep, rngseed=888)
+print(soilr)
+sample_variables(soilr)
+# Ordination
+sord = ordinate(soilr, "DCA")
+# Gap Statistic
+gs = gapstat_ord(sord, axes=1:4, verbose=FALSE)
+# Evaluate results with plots, etc.
+plot_scree(sord)
+plot_ordination(soilr, sord, color="Treatment")
+plot_clusgap(gs)
+print(gs, method="Tibs2001SEmax")
+# Non-ordination example, use cluster::clusGap function directly
+library("cluster")
+pam1 = function(x, k){list(cluster = pam(x, k, cluster.only=TRUE))}
+gs.pam.RU = clusGap(ruspini, FUN = pam1, K.max = 8, B = 60)
+gs.pam.RU
+plot(gs.pam.RU, main = "Gap statistic for the 'ruspini' data")
+mtext("k = 4 is best .. and k = 5 pretty close")
+plot_clusgap(gs.pam.RU)
+}
+\seealso{
+\code{\link{gapstat_ord}}
+
+\code{\link[cluster]{clusGap}}
+
+\code{\link[ggplot2]{ggplot}}
+}
+
diff --git a/man/plot_heatmap.Rd b/man/plot_heatmap.Rd
new file mode 100644
index 0000000..a32337a
--- /dev/null
+++ b/man/plot_heatmap.Rd
@@ -0,0 +1,206 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/plot-methods.R
+\name{plot_heatmap}
+\alias{plot_heatmap}
+\title{Create an ecologically-organized heatmap using ggplot2 graphics}
+\usage{
+plot_heatmap(physeq, method = "NMDS", distance = "bray",
+ sample.label = NULL, taxa.label = NULL, low = "#000033",
+ high = "#66CCFF", na.value = "black", trans = log_trans(4),
+ max.label = 250, title = NULL, sample.order = NULL, taxa.order = NULL,
+ first.sample = NULL, first.taxa = NULL, ...)
+}
+\arguments{
+\item{physeq}{(Required). The data, in the form of an instance of the
+\code{\link{phyloseq-class}}. This should be what you get as a result
+from one of the
+\code{\link{import}} functions, or any of the processing downstream.
+No data components beyond the \code{\link{otu_table}} are strictly
+necessary, though they may be useful if you want to re-label the
+axis ticks according to some observable or taxonomic rank, for instance,
+or if you want to use a \code{\link{UniFrac}}-based distance
+(in which case your \code{physeq} data would need to have a tree included).}
+
+\item{method}{(Optional).
+The ordination method to use for organizing the
+heatmap. A great deal of the usefulness of a heatmap graphic depends upon
+the way in which the rows and columns are ordered.}
+
+\item{distance}{(Optional). A character string.
+The ecological distance method to use in the ordination.
+See \code{\link{distance}}.}
+
+\item{sample.label}{(Optional). A character string.
+The sample variable by which you want to re-label the sample (horizontal) axis.}
+
+\item{taxa.label}{(Optional). A character string.
+The name of the taxonomic rank by which you want to
+re-label the taxa/species/OTU (vertical) axis.
+You can see available options in your data using
+\code{\link{rank_names}(physeq)}.}
+
+\item{low}{(Optional). A character string. An R color.
+See \code{?\link{colors}} for options supported in R (there are lots).
+The color that represents the lowest non-zero value
+in the heatmap. Default is a dark blue color, \code{"#000033"}.}
+
+\item{high}{(Optional). A character string. An R color.
+See \code{\link{colors}} for options supported in R (there are lots).
+The color that will represent the highest
+value in the heatmap. The default is \code{"#66CCFF"}.
+Zero-values are treated as \code{NA}, and set to \code{"black"}, to represent
+a background color.}
+
+\item{na.value}{(Optional). A character string. An R color.
+See \code{\link{colors}} for options supported in R (there are lots).
+The color to represent what is essentially the background of the plot,
+the non-observations that occur as \code{NA} or
+\code{0} values in the abundance table. The default is \code{"black"}, which
+works well on computer-screen graphics devices, but may be a poor choice for
+printers, in which case you might want this value to be \code{"white"}, and
+reverse the values of \code{high} and \code{low}, above.}
+
+\item{trans}{(Optional). \code{"trans"}-class transformer-definition object.
+A numerical transformer to use in
+the continuous color scale. See \code{\link[scales]{trans_new}} for details.
+The default is \code{\link{log_trans}(4)}.}
+
+\item{max.label}{(Optional). Integer. Default is 250.
+ The maximum number of labels to fit on a given axis (either x or y).
+ If number of taxa or samples exceeds this value,
+ the corresponding axis will be stripped of any labels.
+
+ This supersedes any arguments provided to
+ \code{sample.label} or \code{taxa.label}.
+ Make sure to increase this value if, for example,
+ you want a special label
+ for an axis that has 300 indices.}
+
+\item{title}{(Optional). Default \code{NULL}. Character string.
+The main title for the graphic.}
+
+\item{sample.order}{(Optional). Default \code{NULL}.
+Either a single character string matching
+one of the \code{\link{sample_variables}} in your data,
+or a character vector of \code{\link{sample_names}}
+in the precise order that you want them displayed in the heatmap.
+This overrides any ordination ordering that might be done
+with the \code{method}/\code{distance} arguments.}
+
+\item{taxa.order}{(Optional). Default \code{NULL}.
+Either a single character string matching
+one of the \code{\link{rank_names}} in your data,
+or a character vector of \code{\link{taxa_names}}
+in the precise order that you want them displayed in the heatmap.
+This overrides any ordination ordering that might be done
+with the \code{method}/\code{distance} arguments.}
+
+\item{first.sample}{(Optional). Default \code{NULL}.
+A character string matching one of the \code{\link{sample_names}}
+of your input data (\code{physeq}).
+It will become the left-most sample in the plot.
+For the ordination-based ordering (recommended),
+the left and right edges of the axes are adjacent in a continuous ordering.
+Therefore, the choice of starting sample is meaningless and arbitrary,
+but it is aesthetically poor to have the left and right edge split
+a natural cluster in the data.
+This argument allows you to specify the left edge
+and thereby avoid cluster-splitting, emphasize a gradient, etc.}
+
+\item{first.taxa}{(Optional). Default \code{NULL}.
+A character string matching one of the \code{\link{taxa_names}}
+of your input data (\code{physeq}).
+This is equivalent to \code{first.sample} (above),
+but for the taxa/OTU indices, usually the vertical axis.}
+
+\item{...}{(Optional). Additional parameters passed to \code{\link{ordinate}}.}
+}
+\value{
+A heatmap plot, in the form of a \code{\link{ggplot}2} plot object,
+ which can be further saved and modified.
+}
+\description{
+There are many useful examples of phyloseq heatmap graphics in the
+\href{http://joey711.github.io/phyloseq/plot_heatmap-examples}{phyloseq online tutorials}.
+In a 2010 article in BMC Bioinformatics, Rajaram and Oono describe an
+approach to creating a heatmap using ordination methods to organize the
+rows and columns instead of (hierarchical) cluster analysis. In many cases
+the ordination-based ordering does a much better job than h-clustering.
+An immediately useful example of their approach is provided in the NeatMap
+package for R. The NeatMap package can be used directly on the abundance
+table (\code{\link{otu_table-class}}) of phylogenetic-sequencing data, but
+the NMDS or PCA ordination options that it supports are not based on ecological
+distances. To fill this void, phyloseq provides the \code{plot_heatmap()}
+function as an ecology-oriented variant of the NeatMap approach to organizing
+a heatmap and build it using ggplot2 graphics tools.
+The \code{distance} and \code{method} arguments are the same as for the
+\code{\link{plot_ordination}} function, and support a large number of
+distances and ordination methods, respectively, with a strong leaning toward
+ecology.
+This function also provides the options to re-label the OTU and sample
+axis-ticks with a taxonomic name and/or sample variable, respectively,
+in the hope that this might hasten your interpretation of the patterns
+(See the \code{sample.label} and \code{taxa.label} documentation, below).
+Note that this function makes no attempt to overlay hierarchical
+clustering trees on the axes, as hierarchical clustering is not used to
+organize the plot. Also note that each re-ordered axis repeats at the edge,
+and so apparent clusters at the far right/left or top/bottom of the
+heat-map may actually be the same. For now, the placement of this edge
+can be considered arbitrary, so beware of this artifact of this graphical
+representation. If you benefit from this phyloseq-specific implementation
+of the NeatMap approach, please cite both our packages/articles.
+}
+\details{
+This approach borrows heavily from the \code{heatmap1} function in the
+\code{NeatMap} package. Highly recommended, and we are grateful for their
+package and ideas, which we have adapted for our specific purposes here,
+but did not use an explicit dependency. At the time of the first version
+of this implementation, the NeatMap package depends on the rgl-package,
+which is not needed in phyloseq, at present. Although likely a transient
+issue, the rgl-package has some known installation issues that have further
+influenced us to avoid making NeatMap a formal dependency (Although we love
+both NeatMap and rgl!).
+}
+\examples{
+data("GlobalPatterns")
+gpac <- subset_taxa(GlobalPatterns, Phylum=="Crenarchaeota")
+# FYI, the base-R function uses a non-ecological ordering scheme,
+# but does add potentially useful hclust dendrogram to the sides...
+gpac <- subset_taxa(GlobalPatterns, Phylum=="Crenarchaeota")
+# Remove the nearly-empty samples (e.g. 50 reads or less)
+gpac = prune_samples(sample_sums(gpac) > 50, gpac)
+# Arbitrary order if method set to NULL
+plot_heatmap(gpac, method=NULL, sample.label="SampleType", taxa.label="Family")
+# Use ordination
+plot_heatmap(gpac, sample.label="SampleType", taxa.label="Family")
+# Use ordination for OTUs, but not sample-order
+plot_heatmap(gpac, sample.label="SampleType", taxa.label="Family", sample.order="SampleType")
+# Specifying both orders omits any attempt to use ordination. The following should be the same.
+p0 = plot_heatmap(gpac, sample.label="SampleType", taxa.label="Family", taxa.order="Phylum", sample.order="SampleType")
+p1 = plot_heatmap(gpac, method=NULL, sample.label="SampleType", taxa.label="Family", taxa.order="Phylum", sample.order="SampleType")
+#expect_equivalent(p0, p1)
+# Example: Order matters. Random ordering of OTU indices is difficult to interpret, even with structured sample order
+rando = sample(taxa_names(gpac), size=ntaxa(gpac), replace=FALSE)
+plot_heatmap(gpac, method=NULL, sample.label="SampleType", taxa.label="Family", taxa.order=rando, sample.order="SampleType")
+# # Select the edges of each axis.
+# First, arbitrary edge, ordering
+plot_heatmap(gpac, method=NULL)
+# Second, biological-ordering (instead of default ordination-ordering), but arbitrary edge
+plot_heatmap(gpac, taxa.order="Family", sample.order="SampleType")
+# Third, biological ordering, selected edges
+plot_heatmap(gpac, taxa.order="Family", sample.order="SampleType", first.taxa="546313", first.sample="NP2")
+# Fourth, add meaningful labels
+plot_heatmap(gpac, sample.label="SampleType", taxa.label="Family", taxa.order="Family", sample.order="SampleType", first.taxa="546313", first.sample="NP2")
+}
+\references{
+Because this function relies so heavily in principle, and in code, on some of the
+ functionality in NeatMap, please cite their article if you use this function
+ in your work.
+
+ Rajaram, S., & Oono, Y. (2010).
+ NeatMap--non-clustering heat map alternatives in R. BMC Bioinformatics, 11, 45.
+
+Please see further examples in the
+\href{http://joey711.github.io/phyloseq/plot_heatmap-examples}{phyloseq online tutorials}.
+}
+
diff --git a/man/plot_net.Rd b/man/plot_net.Rd
new file mode 100644
index 0000000..bb8aa0c
--- /dev/null
+++ b/man/plot_net.Rd
@@ -0,0 +1,117 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/plot-methods.R
+\name{plot_net}
+\alias{plot_net}
+\title{Microbiome Network Plot using ggplot2}
+\usage{
+plot_net(physeq, distance = "bray", type = "samples", maxdist = 0.7,
+ laymeth = "fruchterman.reingold", color = NULL, shape = NULL,
+ rescale = FALSE, point_size = 5, point_alpha = 1, point_label = NULL,
+ hjust = 1.35, title = NULL)
+}
+\arguments{
+\item{physeq}{(Required).
+The \code{\link{phyloseq-class}} object that you want to represent as a network.}
+
+\item{distance}{(Optional). Default is \code{"bray"}.
+Can be either a distance method supported by \code{\link[phyloseq]{distance}},
+or an already-computed \code{\link{dist}}-class with labels that match
+the indices implied by both the \code{physeq} and \code{type} arguments
+(that is, either sample or taxa names).
+If you used \code{\link[phyloseq]{distance}} to pre-calculate your \code{\link{dist}}ance,
+and the same \code{type} argument as provided here, then they will match.}
+
+\item{type}{(Optional). Default \code{"samples"}.
+Whether the network built from the primary argument, \code{physeq},
+is samples or taxa/OTUs.
+Supported arguments are \code{"samples"}, \code{"taxa"},
+where \code{"taxa"} indicates using the taxa indices,
+whether they actually represent species or some other taxonomic rank.}
+
+\item{maxdist}{(Optional). Default \code{0.7}.
+The maximum distance value between two vertices
+to connect with an edge in the graphic.}
+
+\item{laymeth}{(Optional). Default \code{"fruchterman.reingold"}.
+A character string that indicates the method that will determine
+the placement of vertices, typically based on connectedness of vertices
+and the number of vertices.
+This is an interesting topic, and there are lots of options.
+See \code{\link{igraph-package}} for related topics in general,
+and see \code{\link[igraph]{layout.auto}} for descriptions of various
+alternative layout method options supported here.
+The character string argument should match exactly the
+layout function name with the \code{"layout."} omitted.
+Try \code{laymeth="list"} to see a list of options.}
+
+\item{color}{(Optional). Default \code{NULL}.
+The name of the sample variable in \code{physeq} to use for color mapping
+of points (graph vertices).}
+
+\item{shape}{(Optional). Default \code{NULL}.
+The name of the sample variable in \code{physeq} to use for shape mapping
+of points (graph vertices).}
+
+\item{rescale}{(Optional). Logical. Default \code{FALSE}.
+Whether to rescale the distance values to be \code{[0, 1]}, in which the
+min value is close to zero and the max value is 1.}
+
+\item{point_size}{(Optional). Default \code{5}.
+The size of the vertex points.}
+
+\item{point_alpha}{(Optional). Default \code{1}.
+A value between 0 and 1 for the alpha transparency of the vertex points.}
+
+\item{point_label}{(Optional). Default \code{NULL}.
+The variable name in \code{physeq} covariate data to map to vertex labels.}
+
+\item{hjust}{(Optional). Default \code{1.35}.
+The amount of horizontal justification to use for each label.}
+
+\item{title}{(Optional). Default \code{NULL}. Character string.
+The main title for the graphic.}
+}
+\value{
+A \code{\link{ggplot}}2 network plot.
+ Will render to default graphic device automatically as print side effect.
+ Can also be saved, further manipulated, or rendered to
+ a vector or raster file using \code{\link{ggsave}}.
+}
+\description{
+There are many useful examples of phyloseq network graphics in the
+\href{http://joey711.github.io/phyloseq/plot_net-examples}{phyloseq online tutorials}.
+A custom plotting function for displaying networks
+using advanced \code{\link[ggplot2]{ggplot}}2 formatting.
+Note that this function is a performance and interface revision to
+\code{\link{plot_network}}, which requires an \code{\link[igraph]{igraph}}
+object as its first argument.
+This new function is more in-line with other
+\code{plot_*} functions in the \code{\link{phyloseq-package}}, in that its
+first/main argument is a \code{\link{phyloseq-class}} instance.
+Edges in the network are created if the distance between
+nodes is below a (potentially arbitrary) threshold,
+and special care should be given to considering the choice of this threshold.
+However, network line thickness and opacity are scaled according to the
+similarity of vertices (either samples or taxa),
+helping to temper, somewhat, the effect of the threshold.
+Also note that the choice of network layout algorithm can have a large effect
+on the impression and interpretability of the network graphic,
+and you may want to familiarize yourself with some of these options
+(see the \code{laymeth} argument).
+}
+\examples{
+data(enterotype)
+plot_net(enterotype, color="SeqTech", maxdist = 0.3)
+plot_net(enterotype, color="SeqTech", maxdist = 0.3, laymeth = "auto")
+plot_net(enterotype, color="SeqTech", maxdist = 0.3, laymeth = "svd")
+plot_net(enterotype, color="SeqTech", maxdist = 0.3, laymeth = "circle")
+plot_net(enterotype, color="SeqTech", shape="Enterotype", maxdist = 0.3, laymeth = "circle")
+}
+\seealso{
+Original network plotting functions:
+
+ \code{\link{make_network}}
+
+ \code{\link{plot_network}}
+}
+
diff --git a/man/plot_network.Rd b/man/plot_network.Rd
new file mode 100644
index 0000000..fc2a96f
--- /dev/null
+++ b/man/plot_network.Rd
@@ -0,0 +1,112 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/plot-methods.R
+\name{plot_network}
+\alias{plot_network}
+\title{Microbiome Network Plot using ggplot2}
+\usage{
+plot_network(g, physeq=NULL, type="samples",
+ color=NULL, shape=NULL, point_size=4, alpha=1,
+ label="value", hjust = 1.35,
+ line_weight=0.5, line_color=color, line_alpha=0.4,
+ layout.method=layout.fruchterman.reingold, title=NULL)
+}
+\arguments{
+\item{g}{(Required). An \code{igraph}-class object created
+either by the convenience wrapper \code{\link{make_network}},
+or directly by the tools in the igraph-package.}
+
+\item{physeq}{(Optional). Default \code{NULL}.
+A \code{\link{phyloseq-class}} object on which \code{g} is based.}
+
+\item{type}{(Optional). Default \code{"samples"}.
+Whether the network represented in the primary argument, \code{g},
+is samples or taxa/OTUs.
+Supported arguments are \code{"samples"}, \code{"taxa"},
+where \code{"taxa"} indicates using the taxa indices,
+whether they actually represent species or some other taxonomic rank.}
+
+\item{color}{(Optional). Default \code{NULL}.
+The name of the sample variable in \code{physeq} to use for color mapping
+of points (graph vertices).}
+
+\item{shape}{(Optional). Default \code{NULL}.
+The name of the sample variable in \code{physeq} to use for shape mapping
+of points (graph vertices).}
+
+\item{point_size}{(Optional). Default \code{4}.
+The size of the vertex points.}
+
+\item{alpha}{(Optional). Default \code{1}.
+A value between 0 and 1 for the alpha transparency of the vertex points.}
+
+\item{label}{(Optional). Default \code{"value"}.
+The name of the sample variable in \code{physeq} to use for
+labelling the vertex points.}
+
+\item{hjust}{(Optional). Default \code{1.35}.
+The amount of horizontal justification to use for each label.}
+
+\item{line_weight}{(Optional). Default \code{0.5}.
+The line thickness to use to label graph edges.}
+
+\item{line_color}{(Optional). Default \code{color}.
+The name of the sample variable in \code{physeq} to use for color mapping
+of lines (graph edges).}
+
+\item{line_alpha}{(Optional). Default \code{0.4}.
+The transparency level for graph-edge lines.}
+
+\item{layout.method}{(Optional). Default \code{layout.fruchterman.reingold}.
+A function (closure) that determines the placement of the vertices
+for drawing a graph. Should be able to take an \code{igraph}-class
+as sole argument, and return a two-column coordinate matrix with \code{nrow}
+equal to the number of vertices. For possible options already included in
+\code{igraph}-package, see the others also described in the help file:
+
+\code{\link[igraph]{layout.fruchterman.reingold}}}
+
+\item{title}{(Optional). Default \code{NULL}. Character string.
+ The main title for the graphic.}
+}
+\value{
+A \code{\link{ggplot}}2 plot representing the network,
+ with optional mapping of variable(s) to point color or shape.
+}
+\description{
+There are many useful examples of phyloseq network graphics in the
+\href{http://joey711.github.io/phyloseq/plot_network-examples}{phyloseq online tutorials}.
+A custom plotting function for displaying networks
+using advanced \code{\link[ggplot2]{ggplot}}2 formatting.
+The network itself should be represented using
+the \code{igraph} package.
+For the \code{\link{phyloseq-package}} it is suggested that the network object
+(argument \code{g})
+be created using the
+ \code{\link{make_network}} function,
+and based upon sample-wise or taxa-wise microbiome ecological distances
+calculated from a phylogenetic sequencing experiment
+(\code{\link{phyloseq-class}}).
+In this case, edges in the network are created if the distance between
+nodes is below a potentially arbitrary threshold,
+and special care should be given to considering the choice of this threshold.
+}
+\examples{
+
+data(enterotype)
+ig <- make_network(enterotype, max.dist=0.3)
+plot_network(ig, enterotype, color="SeqTech", shape="Enterotype", line_weight=0.3, label=NULL)
+# Change distance parameter
+ig <- make_network(enterotype, max.dist=0.2)
+plot_network(ig, enterotype, color="SeqTech", shape="Enterotype", line_weight=0.3, label=NULL)
+}
+\references{
+This code was adapted from a repo originally hosted on GitHub by Scott Chamberlain:
+ \url{https://github.com/SChamberlain/gggraph}
+
+ The code most directly used/modified was first posted here:
+ \url{http://www.r-bloggers.com/basic-ggplot2-network-graphs/}
+}
+\seealso{
+\code{\link{make_network}}
+}
+
diff --git a/man/plot_ordination.Rd b/man/plot_ordination.Rd
new file mode 100644
index 0000000..86272b9
--- /dev/null
+++ b/man/plot_ordination.Rd
@@ -0,0 +1,114 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/plot-methods.R
+\name{plot_ordination}
+\alias{plot_ordination}
+\title{General ordination plotter based on ggplot2.}
+\usage{
+plot_ordination(physeq, ordination, type = "samples", axes = 1:2,
+ color = NULL, shape = NULL, label = NULL, title = NULL,
+ justDF = FALSE)
+}
+\arguments{
+\item{physeq}{(Required). \code{\link{phyloseq-class}}.
+The data about which you want to
+plot and annotate the ordination.}
+
+\item{ordination}{(Required). An ordination object. Many different classes
+of ordination are defined by \code{R} packages. Ordination classes
+currently supported/created by the \code{\link{ordinate}} function are
+supported here. There is no default, as the expectation is that the
+ordination will be performed and saved prior to calling this plot function.}
+
+\item{type}{(Optional). The plot type. Default is \code{"samples"}. The
+currently supported options are
+\code{c("samples", "sites", "species", "taxa", "biplot", "split", "scree")}.
+The option
+``taxa'' is equivalent to ``species'' in this case, and similarly,
+``samples'' is equivalent to ``sites''.
+The options
+\code{"sites"} and \code{"species"} result in a single-plot of just the
+sites/samples or species/taxa of the ordination, respectively.
+The \code{"biplot"} and \code{"split"} options result in a combined
+plot with both taxa and samples, either combined into one plot (``biplot'')
+or
+separated in two facet panels (``split''), respectively.
+The \code{"scree"} option results in a call to \code{\link{plot_scree}},
+which produces an ordered bar plot of the normalized eigenvalues
+associated with each ordination axis.}
+
+\item{axes}{(Optional). A 2-element vector indicating the axes of the
+ordination that should be used for plotting.
+Can be \code{\link{character-class}} or \code{\link{integer-class}},
+naming the index name or index of the desired axis for the horizontal
+and vertical axes, respectively, in that order. The default value,
+\code{c(1, 2)}, specifies the first two axes of the provided ordination.}
+
+\item{color}{(Optional). Default \code{NULL}. Character string.
+ The name of the variable to map to
+ colors in the plot.
+ This can be a sample variable
+ (among the set returned by \code{sample_variables(physeq)} )
+ or
+ taxonomic rank
+ (among the set returned by \code{rank_names(physeq)}).
+
+ Note that the color scheme is chosen automatically
+ by \code{\link[ggplot2]{ggplot}},
+ but it can be modified afterward with an additional layer using
+ \code{\link[ggplot2]{scale_color_manual}}.}
+
+\item{shape}{(Optional). Default \code{NULL}. Character string.
+ The name of the variable to map
+ to different shapes on the plot.
+ Similar to \code{color} option, but for the shape of points.
+
+ The shape scale is chosen automatically by \code{\link[ggplot2]{ggplot}},
+ but it can be modified afterward with an additional layer using
+ \code{\link[ggplot2]{scale_shape_manual}}.}
+
+\item{label}{(Optional). Default \code{NULL}. Character string.
+The name of the variable to map to text labels on the plot.
+Similar to \code{color} option, but for plotting text.}
+
+\item{title}{(Optional). Default \code{NULL}. Character string.
+The main title for the graphic.}
+
+\item{justDF}{(Optional). Default \code{FALSE}. Logical.
+Instead of returning a ggplot2-object, do you just want the relevant
+\code{data.frame} that was used to build the plot? This is a
+user-accessible option for obtaining the \code{data.frame},
+in principle to make a custom plot that isn't possible with the
+available options in this function. For contributing new functions
+(developers), the
+\code{\link{phyloseq-package}} provides/uses an internal function
+to build the key features of the \code{data.frame} prior to plot-build.}
+}
+\value{
+A \code{\link{ggplot}} plot object, graphically summarizing
+ the ordination result for the specified axes.
+}
+\description{
+There are many useful examples of phyloseq ordination graphics in the
+\href{http://joey711.github.io/phyloseq/plot_ordination-examples}{phyloseq online tutorials}.
+Convenience wrapper for plotting ordination results as a
+\code{ggplot2}-graphic, including
+additional annotation in the form of shading, shape, and/or labels of
+sample variables.
+}
+\examples{
+# See other examples at
+# http://joey711.github.io/phyloseq/plot_ordination-examples
+data(GlobalPatterns)
+GP = prune_taxa(names(sort(taxa_sums(GlobalPatterns), TRUE)[1:50]), GlobalPatterns)
+gp_bray_pcoa = ordinate(GP, "CCA", "bray")
+plot_ordination(GP, gp_bray_pcoa, "samples", color="SampleType")
+}
+\seealso{
+Many more examples are included in the
+ \href{http://joey711.github.io/phyloseq/plot_ordination-examples}{phyloseq online tutorials}.
+
+Also see the general wrapping function:
+
+\code{\link{plot_phyloseq}}
+}
+
diff --git a/man/plot_phyloseq-methods.Rd b/man/plot_phyloseq-methods.Rd
new file mode 100644
index 0000000..5cdf53a
--- /dev/null
+++ b/man/plot_phyloseq-methods.Rd
@@ -0,0 +1,48 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/plot-methods.R
+\docType{methods}
+\name{plot_phyloseq}
+\alias{plot_phyloseq}
+\alias{plot_phyloseq,phyloseq-method}
+\title{Generic plot defaults for phyloseq.}
+\usage{
+plot_phyloseq(physeq, ...)
+
+\S4method{plot_phyloseq}{phyloseq}(physeq, ...)
+}
+\arguments{
+\item{physeq}{(Required). \code{\link{phyloseq-class}}. The actual plot type
+depends on the available (non-empty) component data types contained within.}
+
+\item{...}{(Optional). Additional parameters to be passed on to the respective
+specific plotting function. See below for different plotting functions that
+might be called by this generic plotting wrapper.}
+}
+\value{
+A plot is created. The nature and class of the plot depends on
+ the \code{physeq} argument, specifically, which component data classes
+ are present.
+}
+\description{
+There are many useful examples of phyloseq graphics functions in the
+\href{http://joey711.github.io/phyloseq}{phyloseq online tutorials}.
+The specific plot type is chosen according to available non-empty slots.
+This is mainly for syntactic convenience and quick-plotting. See links below
+for some examples of graphics tools available in the
+\code{\link{phyloseq-package}}.
+}
+\examples{
+data(esophagus)
+plot_phyloseq(esophagus)
+}
+\seealso{
+\href{http://joey711.github.io/phyloseq/tutorials-index.html}{phyloseq frontpage tutorials}.
+
+ \code{\link{plot_ordination}}
+ \code{\link{plot_heatmap}}
+ \code{\link{plot_tree}}
+ \code{\link{plot_network}}
+ \code{\link{plot_bar}}
+ \code{\link{plot_richness}}
+}
+
diff --git a/man/plot_richness.Rd b/man/plot_richness.Rd
new file mode 100644
index 0000000..f26ea3c
--- /dev/null
+++ b/man/plot_richness.Rd
@@ -0,0 +1,134 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/plot-methods.R
+\name{plot_richness}
+\alias{plot_richness}
+\title{Plot alpha diversity, flexibly with ggplot2}
+\usage{
+plot_richness(physeq, x = "samples", color = NULL, shape = NULL,
+ title = NULL, scales = "free_y", nrow = 1, shsi = NULL,
+ measures = NULL, sortby = NULL)
+}
+\arguments{
+\item{physeq}{(Required). \code{\link{phyloseq-class}}, or alternatively,
+an \code{\link{otu_table-class}}. The data about which you want to estimate.}
+
+\item{x}{(Optional). A variable to map to the horizontal axis. The vertical
+ axis will be mapped to the alpha diversity index/estimate
+ and have units of total taxa, and/or index value (dimensionless).
+ This parameter (\code{x}) can be either a character string indicating a
+ variable in \code{sample_data}
+ (among the set returned by \code{sample_variables(physeq)} );
+ or a custom supplied vector with length equal to the number of samples
+ in the dataset (nsamples(physeq)).
+
+ The default value is \code{"samples"}, which will map each sample's name
+ to a separate horizontal position in the plot.}
+
+\item{color}{(Optional). Default \code{NULL}.
+The sample variable to map to different colors.
+Like \code{x}, this can be a single character string of the variable name in
+\code{sample_data}
+(among the set returned by \code{sample_variables(physeq)} );
+or a custom supplied vector with length equal to the number of samples
+in the dataset (nsamples(physeq)).
+The color scheme is chosen automatically by \code{\link[ggplot2]{ggplot}},
+but it can be modified afterward with an additional layer using
+\code{\link[ggplot2]{scale_color_manual}}.}
+
+\item{shape}{(Optional). Default \code{NULL}. The sample variable to map
+to different shapes. Like \code{x} and \code{color},
+this can be a single character string
+of the variable name in
+\code{sample_data}
+(among the set returned by \code{sample_variables(physeq)} );
+or a custom supplied vector with length equal to the number of samples
+in the dataset (nsamples(physeq)).
+The shape scale is chosen automatically by \code{\link[ggplot2]{ggplot}},
+but it can be modified afterward with an additional layer using
+\code{\link[ggplot2]{scale_shape_manual}}.}
+
+\item{title}{(Optional). Default \code{NULL}. Character string.
+The main title for the graphic.}
+
+\item{scales}{(Optional). Default \code{"free_y"}.
+Whether to let vertical axis have free scale that adjusts to
+the data in each panel.
+This argument is passed to \code{\link[ggplot2]{facet_wrap}}.
+If set to \code{"fixed"}, a single vertical scale will
+be used in all panels. This can obscure values if the
+\code{measures} argument includes both
+richness estimates and diversity indices, for example.}
+
+\item{nrow}{(Optional). Default is \code{1},
+meaning that all plot panels will be placed in a single row,
+side-by-side.
+This argument is passed to \code{\link[ggplot2]{facet_wrap}}.
+If \code{NULL}, the number of rows and columns will be
+chosen automatically (wrapped) based on the number of panels
+and the size of the graphics device.}
+
+\item{shsi}{(Deprecated). No longer supported. Instead see `measures` below.}
+
+\item{measures}{(Optional). Default is \code{NULL}, meaning that
+all available alpha-diversity measures will be included in plot panels.
+Alternatively, you can specify one or more measures
+as a character vector of measure names.
+Values must be among those supported:
+\code{c("Observed", "Chao1", "ACE", "Shannon", "Simpson", "InvSimpson", "Fisher")}.}
+
+\item{sortby}{(Optional). A character string subset of \code{measures} argument.
+Sort x-indices by the mean of one or more \code{measures},
+if x-axis is mapped to a discrete variable.
+Default is \code{NULL}, implying that a discrete-value horizontal axis
+will use default sorting, usually alphabetic.}
+}
+\value{
+A \code{\link{ggplot}} plot object summarizing
+ the richness estimates, and their standard error.
+}
+\description{
+There are many useful examples of alpha-diversity graphics in the
+\href{http://joey711.github.io/phyloseq/plot_richness-examples}{phyloseq online tutorials}.
+This function estimates a number of alpha-diversity metrics using the
+\code{\link{estimate_richness}} function,
+and returns a \code{ggplot} plotting object.
+The plot generated by this function will include every sample
+in \code{physeq}, but they can be further grouped on the horizontal axis
+through the argument to \code{x},
+and shaded according to the argument to \code{color} (see below).
+You must use untrimmed, non-normalized count data for meaningful results,
+as many of these estimates are highly dependent on the number of singletons.
+You can always trim the data later on if needed,
+just not before using this function.
+}
+\details{
+NOTE: Because this plotting function incorporates the output from
+ \code{\link{estimate_richness}}, the variable names of that output should
+ not be used as \code{x} or \code{color} (even if it works, the resulting
+ plot might be kind of strange, and not the intended behavior of this function).
+ The following are the names you will want to avoid using in \code{x} or \code{color}:
+
+\code{c("Observed", "Chao1", "ACE", "Shannon", "Simpson", "InvSimpson", "Fisher")}.
+}
+\examples{
+## There are many more interesting examples at the phyloseq online tutorials.
+## http://joey711.github.io/phyloseq/plot_richness-examples
+data("soilrep")
+plot_richness(soilrep, measures=c("InvSimpson", "Fisher"))
+plot_richness(soilrep, "Treatment", "warmed", measures=c("Chao1", "ACE", "InvSimpson"), nrow=3)
+data("GlobalPatterns")
+plot_richness(GlobalPatterns, x="SampleType", measures=c("InvSimpson"))
+plot_richness(GlobalPatterns, x="SampleType", measures=c("Chao1", "ACE", "InvSimpson"), nrow=3)
+plot_richness(GlobalPatterns, x="SampleType", measures=c("Chao1", "ACE", "InvSimpson"), nrow=3, sortby = "Chao1")
+}
+\seealso{
+\code{\link{estimate_richness}}
+
+ \code{\link[vegan]{estimateR}}
+
+ \code{\link[vegan]{diversity}}
+
+There are many more interesting examples at the
+\href{http://joey711.github.io/phyloseq/plot_richness-examples}{phyloseq online tutorials}.
+}
+
diff --git a/man/plot_scree.Rd b/man/plot_scree.Rd
new file mode 100644
index 0000000..732ad81
--- /dev/null
+++ b/man/plot_scree.Rd
@@ -0,0 +1,55 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/plot-methods.R
+\name{plot_scree}
+\alias{plot_scree}
+\title{General ordination eigenvalue plotter using ggplot2.}
+\usage{
+plot_scree(ordination, title = NULL)
+}
+\arguments{
+\item{ordination}{(Required). An ordination object. Many different classes
+of ordination are defined by \code{R} packages. Ordination classes
+currently supported/created by the \code{\link{ordinate}} function are
+supported here.
+There is no default, as the expectation is that the
+ordination will be performed and saved prior to calling this plot function.}
+
+\item{title}{(Optional). Default \code{NULL}. Character string.
+The main title for the graphic.}
+}
+\value{
+A \code{\link{ggplot}} plot object, graphically summarizing
+ the ordination result for the specified axes.
+}
+\description{
+Convenience wrapper for plotting ordination eigenvalues (if available)
+using a \code{ggplot2}-graphic.
+}
+\examples{
+# First load and trim a dataset
+data("GlobalPatterns")
+GP = prune_taxa(names(sort(taxa_sums(GlobalPatterns), TRUE)[1:50]), GlobalPatterns)
+# Test plots (performs ordination in-line, then makes scree plot)
+plot_scree(ordinate(GP, "DPCoA", "bray"))
+plot_scree(ordinate(GP, "PCoA", "bray"))
+# Empty return with message
+plot_scree(ordinate(GP, "NMDS", "bray"))
+# Constrained ordinations
+plot_scree(ordinate(GP, "CCA", formula=~SampleType))
+plot_scree(ordinate(GP, "RDA", formula=~SampleType))
+plot_scree(ordinate(GP, "CAP", formula=~SampleType))
+# Deprecated example of constrained ordination (emits a warning)
+#plot_scree(ordinate(GP ~ SampleType, "RDA"))
+plot_scree(ordinate(GP, "DCA"))
+plot_ordination(GP, ordinate(GP, "DCA"), type="scree")
+}
+\seealso{
+\code{\link{plot_ordination}}
+
+ \code{\link{ordinate}}
+
+ \code{\link{distance}}
+
+ \href{http://joey711.github.io/phyloseq/plot_ordination-examples}{phyloseq online tutorials}
+}
+
diff --git a/man/plot_tree.Rd b/man/plot_tree.Rd
new file mode 100644
index 0000000..6bdbcda
--- /dev/null
+++ b/man/plot_tree.Rd
@@ -0,0 +1,185 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/plot-methods.R
+\name{plot_tree}
+\alias{plot_tree}
+\title{Plot a phylogenetic tree with optional annotations}
+\usage{
+plot_tree(physeq, method = "sampledodge", nodelabf = NULL, color = NULL,
+ shape = NULL, size = NULL, min.abundance = Inf, label.tips = NULL,
+ text.size = NULL, sizebase = 5, base.spacing = 0.02,
+ ladderize = FALSE, plot.margin = 0.2, title = NULL, treetheme = NULL,
+ justify = "jagged")
+}
+\arguments{
+\item{physeq}{(Required). The data about which you want to
+plot and annotate a phylogenetic tree, in the form of a
+single instance of the \code{\link{phyloseq-class}}, containing at
+minimum a phylogenetic tree component (try \code{\link{phy_tree}}).
+One of the major advantages of this function over basic tree-plotting utilities
+in the \code{\link{ape}}-package is the ability to easily annotate the tree
+with sample variables and taxonomic information. For these uses,
+the \code{physeq} argument should also have a \code{\link{sample_data}}
+and/or \code{\link{tax_table}} component(s).}
+
+\item{method}{(Optional). Character string. Default \code{"sampledodge"}.
+The name of the annotation method to use.
+This will be expanded in future versions.
+Currently only \code{"sampledodge"} and \code{"treeonly"} are supported.
+The \code{"sampledodge"} option results in points
+drawn next to leaves if individuals from that taxa were observed,
+and a separate point is drawn for each sample.}
+
+\item{nodelabf}{(Optional). A function. Default \code{NULL}.
+If \code{NULL}, the default, a function will be selected for you based upon
+whether or not there are node labels in \code{phy_tree(physeq)}.
+For convenience, the phyloseq package includes two generator functions
+for adding arbitrary node labels (can be any character string),
+\code{\link{nodeplotdefault}};
+as well as for adding bootstrap values in a certain range,
+\code{\link{nodeplotboot}}.
+To not have any node labels in the graphic, set this argument to
+\code{\link{nodeplotblank}}.}
+
+\item{color}{(Optional). Character string. Default \code{NULL}.
+The name of the variable in \code{physeq} to map to point color.
+Supported options here also include the reserved special variables
+of \code{\link{psmelt}}.}
+
+\item{shape}{(Optional). Character string. Default \code{NULL}.
+The name of the variable in \code{physeq} to map to point shape.
+Supported options here also include the reserved special variables
+of \code{\link{psmelt}}.}
+
+\item{size}{(Optional). Character string. Default \code{NULL}.
+The name of the variable in \code{physeq} to map to point size.
+A special argument \code{"abundance"} is reserved here and scales
+point size using abundance in each sample on a log scale.
+Supported options here also include the reserved special variables
+of \code{\link{psmelt}}.}
+
+\item{min.abundance}{(Optional). Numeric.
+The minimum number of individuals required to label a point
+with the precise number.
+Default is \code{Inf},
+meaning that no points will have their abundance labeled.
+If a vector, only the first element is used.}
+
+\item{label.tips}{(Optional). Character string. Default is \code{NULL},
+indicating that no tip labels will be printed.
+If \code{"taxa_names"}, then the name of the taxa will be added
+to the tree; either next to the leaves, or next to
+the set of points that label the leaves. Alternatively,
+if this is one of the rank names (from \code{rank_names(physeq)}),
+then the identity (if any) for that particular taxonomic rank
+is printed instead.}
+
+\item{text.size}{(Optional). Numeric. Should be positive. The
+size parameter used to control the text size of taxa labels.
+Default is \code{NULL}. If left \code{NULL}, this function
+will automatically calculate a (hopefully) optimal text size
+given the vertical constraints posed by the tree itself.
+This argument is included in case the
+automatically-calculated size is wrong, and you want to change it.
+Note that this parameter is only meaningful if \code{label.tips}
+is not \code{NULL}.}
+
+\item{sizebase}{(Optional). Numeric. Should be positive.
+The base of the logarithm used
+to scale point sizes to graphically represent abundance of
+species in a given sample. Default is 5.}
+
+\item{base.spacing}{(Optional). Numeric. Default is \code{0.02}.
+Should be positive.
+This defines the base-spacing between points at each tip/leaf in the
+tree. The larger this value, the larger the spacing between points.
+This is useful if you have problems with overlapping large points
+and/or text indicating abundance, for example. Similarly, if you
+don't have this problem and want tighter point-spacing, you can
+shrink this value.}
+
+\item{ladderize}{(Optional). Boolean or character string (either
+\code{FALSE}, \code{TRUE}, or \code{"left"}).
+Default is \code{FALSE}.
+This parameter specifies whether or not to \code{\link[ape]{ladderize}} the tree
+(i.e., reorder nodes according to the depth of their enclosed
+subtrees) prior to plotting.
+This tends to make trees more aesthetically pleasing and legible in
+a graphical display.
+When \code{TRUE} or \code{"right"}, ``right'' ladderization is used.
+When set to \code{FALSE}, no ladderization is applied.
+When set to \code{"left"}, the reverse direction
+(``left'' ladderization) is applied.
+This argument is passed on to \code{\link{tree_layout}}.}
+
+\item{plot.margin}{(Optional). Numeric. Default is \code{0.2}.
+Should be positive.
+This defines how much right-hand padding to add to the tree plot,
+which can be required to not truncate tip labels. The margin value
+is specified as a fraction of the overall tree width which is added
+to the right side of the plot area. So a value of \code{0.2} adds
+twenty percent extra space to the right-hand side of the plot.}
+
+\item{title}{(Optional). Default \code{NULL}. Character string.
+The main title for the graphic.}
+
+\item{treetheme}{(Optional).
+A custom \code{\link{ggplot}}2 \code{\link[ggplot2]{theme}} layer
+to use for the tree. Supplants any default theme layers
+used within the function.
+A value of \code{NULL} uses a default, minimal-annotations theme.
+If anything other than a theme or \code{NULL}, the current global ggplot2
+theme will result.}
+
+\item{justify}{(Optional). A character string indicating the
+type of justification to use on dodged points and tip labels.
+A value of \code{"jagged"}, the default, results in
+these tip-mapped elements being spaced as close to the tips as possible
+without gaps.
+Currently, any other value for \code{justify} results in
+a left-justified arrangement of both labels and points.}
+}
+\value{
+A \code{\link{ggplot}}2 plot.
+}
+\description{
+There are many useful examples of phyloseq tree graphics in the
+\href{http://joey711.github.io/phyloseq/plot_tree-examples}{phyloseq online tutorials}.
+This function is intended to facilitate easy graphical investigation of
+the phylogenetic tree, as well as sample data. Note that for phylogenetic
+sequencing of samples with large richness, some of the options in this
+function will be prohibitively slow to render, or too dense to be
+interpretable. A rough ``rule of thumb'' is to use subsets of data
+with not many more than 200 OTUs per plot, sometimes less depending on the
+complexity of the additional annotations being mapped to the tree. It is
+usually possible to create an unreadable, uninterpretable tree with modern
+datasets. However, the goal should be toward parameter settings and data
+subsets that convey (honestly, accurately) some biologically relevant
+feature of the data. One of the goals of the \code{\link{phyloseq-package}}
+is to make the determination of these features/settings as easy as possible.
+}
+\details{
+This function received an early development contribution from the work of
+Gregory Jordan via \href{https://github.com/gjuggler/ggphylo}{the ggphylo package}.
+\code{plot_tree} has since been re-written.
+For details see \code{\link{tree_layout}}.
+}
+\examples{
+# # Using plot_tree() with the esophagus dataset.
+# # Please note that many more interesting examples are shown
+# # in the online tutorials:
+# # http://joey711.github.io/phyloseq/plot_tree-examples
+data(esophagus)
+# plot_tree(esophagus)
+# plot_tree(esophagus, color="Sample")
+# plot_tree(esophagus, size="Abundance")
+# plot_tree(esophagus, size="Abundance", color="samples")
+plot_tree(esophagus, size="Abundance", color="Sample", base.spacing=0.03)
+plot_tree(esophagus, size="abundance", color="samples", base.spacing=0.03)
+}
+\seealso{
+\code{\link{plot.phylo}}
+
+There are many useful examples of phyloseq tree graphics in the
+\href{http://joey711.github.io/phyloseq/plot_tree-examples}{phyloseq online tutorials}.
+}
+
diff --git a/man/prune_samples-methods.Rd b/man/prune_samples-methods.Rd
new file mode 100644
index 0000000..0750bca
--- /dev/null
+++ b/man/prune_samples-methods.Rd
@@ -0,0 +1,50 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/transform_filter-methods.R
+\docType{methods}
+\name{prune_samples}
+\alias{prune_samples}
+\alias{prune_samples,character,otu_table-method}
+\alias{prune_samples,character,phyloseq-method}
+\alias{prune_samples,character,sample_data-method}
+\alias{prune_samples,logical,ANY-method}
+\title{Define a subset of samples to keep in a phyloseq object.}
+\usage{
+prune_samples(samples, x)
+
+\S4method{prune_samples}{character,otu_table}(samples, x)
+
+\S4method{prune_samples}{character,sample_data}(samples, x)
+
+\S4method{prune_samples}{character,phyloseq}(samples, x)
+
+\S4method{prune_samples}{logical,ANY}(samples, x)
+}
+\arguments{
+\item{samples}{(Required). A character vector of the samples in object x that you want to
+keep -- OR alternatively -- a logical vector where the kept samples are TRUE, and length
+is equal to the number of samples in object x. If \code{samples} is a named
+logical, the samples retained are based on those names. Make sure they are
+compatible with the \code{sample_names} of the object you are modifying (\code{x}).}
+
+\item{x}{A phyloseq object.}
+}
+\value{
+The class of the object returned by \code{prune_samples} matches
+the class of the phyloseq object, \code{x}.
+}
+\description{
+An S4 Generic method for pruning/filtering unwanted samples
+by defining those you want to keep.
+}
+\examples{
+ data(GlobalPatterns)
+ # Subset to just the Chlamydiae phylum.
+ GP.chl <- subset_taxa(GlobalPatterns, Phylum=="Chlamydiae")
+ # Remove the samples that have fewer than 20 total reads from Chlamydiae
+ GP.chl <- prune_samples(sample_sums(GP.chl)>=20, GP.chl)
+ # (p <- plot_tree(GP.chl, color="SampleType", shape="Family", label.tips="Genus", size="abundance"))
+}
+\seealso{
+\code{\link{subset_samples}}
+}
+
diff --git a/man/prune_taxa-methods.Rd b/man/prune_taxa-methods.Rd
new file mode 100644
index 0000000..37cbfc3
--- /dev/null
+++ b/man/prune_taxa-methods.Rd
@@ -0,0 +1,71 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/transform_filter-methods.R
+\docType{methods}
+\name{prune_taxa}
+\alias{prune_taxa}
+\alias{prune_taxa,NULL,ANY-method}
+\alias{prune_taxa,character,XStringSet-method}
+\alias{prune_taxa,character,otu_table-method}
+\alias{prune_taxa,character,phylo-method}
+\alias{prune_taxa,character,phyloseq-method}
+\alias{prune_taxa,character,sample_data-method}
+\alias{prune_taxa,character,taxonomyTable-method}
+\alias{prune_taxa,logical,ANY-method}
+\title{Prune unwanted OTUs / taxa from a phylogenetic object.}
+\usage{
+prune_taxa(taxa, x)
+
+\S4method{prune_taxa}{`NULL`,ANY}(taxa, x)
+
+\S4method{prune_taxa}{logical,ANY}(taxa, x)
+
+\S4method{prune_taxa}{character,phylo}(taxa, x)
+
+\S4method{prune_taxa}{character,otu_table}(taxa, x)
+
+\S4method{prune_taxa}{character,sample_data}(taxa, x)
+
+\S4method{prune_taxa}{character,phyloseq}(taxa, x)
+
+\S4method{prune_taxa}{character,taxonomyTable}(taxa, x)
+
+\S4method{prune_taxa}{character,XStringSet}(taxa, x)
+}
+\arguments{
+\item{taxa}{(Required). A character vector of the taxa in object x that you want to
+keep -- OR alternatively -- a logical vector where the kept taxa are TRUE, and length
+is equal to the number of taxa in object x. If \code{taxa} is a named
+logical, the taxa retained are based on those names. Make sure they are
+compatible with the \code{taxa_names} of the object you are modifying (\code{x}).}
+
+\item{x}{(Required). A phylogenetic object, including \code{phylo} trees,
+as well as all phyloseq classes that represent taxa. If the function
+\code{\link{taxa_names}} returns a non-\code{NULL} value, then your object
+can be pruned by this function.}
+}
+\value{
+The class of the object returned by \code{prune_taxa} matches
+the class of the argument, \code{x}.
+}
+\description{
+An S4 Generic method for removing (pruning) unwanted OTUs/taxa from phylogenetic
+objects, including phylo-class trees, as well as native phyloseq package
+objects. This is particularly useful for pruning a phyloseq object that has
+more than one component that describes OTUs.
+Credit: the \code{phylo}-class version is adapted from
+\href{http://cran.at.r-project.org/web/packages/picante/index.html}{prune.sample}.
+}
+\examples{
+data("esophagus")
+esophagus
+plot(sort(taxa_sums(esophagus), TRUE), type="h", ylim=c(0, 50))
+x1 = prune_taxa(taxa_sums(esophagus) > 10, esophagus)
+x2 = prune_taxa(names(sort(taxa_sums(esophagus), TRUE))[1:9], esophagus)
+identical(x1, x2)
+}
+\seealso{
+\code{\link{prune_samples}}
+
+ \href{http://cran.at.r-project.org/web/packages/picante/index.html}{prune.sample}
+}
+
diff --git a/man/psmelt.Rd b/man/psmelt.Rd
new file mode 100644
index 0000000..e7e4f48
--- /dev/null
+++ b/man/psmelt.Rd
@@ -0,0 +1,73 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/plot-methods.R
+\name{psmelt}
+\alias{psmelt}
+\title{Melt phyloseq data object into large data.frame}
+\usage{
+psmelt(physeq)
+}
+\arguments{
+\item{physeq}{(Required). An \code{\link{otu_table-class}} or
+\code{\link{phyloseq-class}}. Function most useful for phyloseq-class.}
+}
+\value{
+A \code{\link{data.frame}}-class table.
+}
+\description{
+The psmelt function is a specialized melt function for melting phyloseq objects
+(instances of the phyloseq class), usually for producing graphics
+with \code{\link[ggplot2]{ggplot}2}. \code{psmelt} relies heavily on the
+\code{\link[reshape2]{melt}} and \code{\link{merge}} functions.
+The naming conventions used in downstream phyloseq graphics functions
+have reserved the following variable names that should not be used
+as the names of \code{\link{sample_variables}}
+or taxonomic \code{\link{rank_names}}.
+These reserved names are \code{c("Sample", "Abundance", "OTU")}.
+Also, you should not have identical names for
+sample variables and taxonomic ranks.
+That is, the intersection of the output of the following two functions
+\code{\link{sample_variables}}, \code{\link{rank_names}}
+should be an empty vector
+(e.g. \code{intersect(sample_variables(physeq), rank_names(physeq))}).
+All of these potential name collisions are checked-for
+and renamed automatically with a warning.
+However, if you (re)name your variables accordingly ahead of time,
+it will reduce confusion and eliminate the warnings.
+}
+\details{
+Note that
+``melted'' phyloseq data is stored much less efficiently,
+and so RAM storage issues could arise with a smaller dataset
+(smaller number of samples/OTUs/variables) than one might otherwise expect.
+For common sizes of graphics-ready datasets, however,
+this should not be a problem.
+Because the number of OTU entries has a large effect on the RAM requirement,
+methods to reduce the number of separate OTU entries --
+for instance by agglomerating OTUs based on phylogenetic distance
+using \code{\link{tip_glom}} --
+can help alleviate RAM usage problems.
+This function is made user-accessible for flexibility,
+but is also used extensively by plot functions in phyloseq.
+}
+\examples{
+data("GlobalPatterns")
+gp.ch = subset_taxa(GlobalPatterns, Phylum == "Chlamydiae")
+mdf = psmelt(gp.ch)
+nrow(mdf)
+ncol(mdf)
+colnames(mdf)
+head(rownames(mdf))
+# Create a ggplot similar to
+library("ggplot2")
+p = ggplot(mdf, aes(x=SampleType, y=Abundance, fill=Genus))
+p = p + geom_bar(color="black", stat="identity", position="stack")
+print(p)
+}
+\seealso{
+\code{\link{plot_bar}}
+
+ \code{\link[reshape2]{melt}}
+
+ \code{\link{merge}}
+}
+
diff --git a/man/rank_names.Rd b/man/rank_names.Rd
new file mode 100644
index 0000000..3884e62
--- /dev/null
+++ b/man/rank_names.Rd
@@ -0,0 +1,33 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/almostAllAccessors.R
+\name{rank_names}
+\alias{rank_names}
+\title{Retrieve the names of the taxonomic ranks}
+\usage{
+rank_names(physeq, errorIfNULL=TRUE)
+}
+\arguments{
+\item{physeq}{(Required).
+\code{\link{taxonomyTable-class}}, or \code{\link{phyloseq-class}}.}
+
+\item{errorIfNULL}{(Optional). Logical. Should the accessor stop with
+an error if the slot is empty (\code{NULL})? Default \code{TRUE}.}
+}
+\value{
+Character vector. The names of the available taxonomic ranks.
+}
+\description{
+This is a simple accessor function to make it more convenient to determine
+the taxonomic ranks that are available in a given \code{\link{phyloseq-class}}
+object.
+}
+\examples{
+data(enterotype)
+rank_names(enterotype)
+}
+\seealso{
+\code{\link{get_taxa}}
+ \code{\link{taxa_names}}
+ \code{\link{sample_names}}
+}
+
diff --git a/man/rarefy_even_depth.Rd b/man/rarefy_even_depth.Rd
new file mode 100644
index 0000000..cfa6f9b
--- /dev/null
+++ b/man/rarefy_even_depth.Rd
@@ -0,0 +1,126 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/transform_filter-methods.R
+\name{rarefy_even_depth}
+\alias{rarefy_even_depth}
+\title{Resample an OTU table such that all samples have the same library size.}
+\usage{
+rarefy_even_depth(physeq, sample.size = min(sample_sums(physeq)),
+ rngseed = FALSE, replace = TRUE, trimOTUs = TRUE, verbose = TRUE)
+}
+\arguments{
+\item{physeq}{(Required). A \code{\link{phyloseq-class}} object that you
+want to trim/filter.}
+
+\item{sample.size}{(Optional). A single integer value equal to the number
+of reads being simulated, also known as the depth,
+and also equal to each value returned by \code{\link{sample_sums}}
+on the output.}
+
+\item{rngseed}{(Optional). A single integer value passed to
+\code{\link{set.seed}}, which is used to fix a seed for reproducibly
+random number generation (in this case, reproducibly random subsampling).
+The default value is \code{FALSE}.
+If set to \code{FALSE}, then no fiddling with the RNG seed is performed,
+and it is up to the user to appropriately call \code{\link{set.seed}}
+beforehand to achieve reproducible results.}
+
+\item{replace}{(Optional). Logical. Whether to sample with replacement
+(\code{TRUE}) or without replacement (\code{FALSE}).
+The default is with replacement (\code{replace=TRUE}).
+Two implications to consider are that
+(1) sampling with replacement is faster and more memory efficient
+as currently implemented; and
+(2), sampling with replacement means that there is a chance that the
+number of reads for a given OTU in a given sample could be larger
+than the original count value, as opposed to sampling without replacement
+where the original count value is the maximum possible.
+Prior to phyloseq package version number \code{1.5.20},
+this parameter did not exist and sampling with replacement was the only
+random subsampling implemented in the \code{rarefy_even_depth} function.
+Note that this default behavior was selected for computational efficiency,
+but differs from analogous functions in related packages
+(e.g. subsampling in QIIME).}
+
+\item{trimOTUs}{(Optional). \code{\link{logical}(1)}.
+Whether to trim OTUs
+from the dataset that are no longer observed in any sample
+(have a count of zero in every sample).
+The number of OTUs trimmed, if any, is printed to
+standard out as a reminder.}
+
+\item{verbose}{(Optional). Logical. Default is \code{TRUE}.
+If \code{TRUE}, extra non-warning, non-error messages are printed
+to standard out, describing steps in the rarefying process,
+the OTUs and samples removed, etc. This can be useful the
+first few times the function is executed, but can be set
+to \code{FALSE} as-needed once behavior has been verified
+as expected.}
+}
+\value{
+An object of class \code{phyloseq}.
+Only the \code{otu_table} component is modified.
+}
+\description{
+Please note that the authors of phyloseq do not advocate using this
+as a normalization procedure, despite its recent popularity.
+Our justifications for using alternative approaches to address
+disparities in library sizes have been made available as
+\href{http://dx.plos.org/10.1371/journal.pcbi.1003531}{an article in PLoS Computational Biology}.
+See \code{\link{phyloseq_to_deseq2}} for a recommended alternative to rarefying
+directly supported in the phyloseq package, as well as
+\href{http://joey711.github.io/waste-not-supplemental/}{the supplemental materials for the PLoS-CB article}
+and \href{http://joey711.github.io/phyloseq-extensions}{the phyloseq extensions repository on GitHub}.
+Nevertheless, for comparison and demonstration, the rarefying procedure is implemented
+here in good faith and with options we hope are useful.
+This function uses the standard R \code{\link{sample}} function to
+resample from the abundance values
+in the \code{\link{otu_table}} component of the first argument,
+\code{physeq}.
+Often one of the major goals of this procedure is to achieve parity in
+total number of counts between samples, as an alternative to other formal
+normalization procedures, which is why a single value for the
+\code{sample.size} is expected.
+This kind of resampling can be performed with and without replacement,
+with replacement being the more computationally-efficient, default setting.
+See the \code{replace} parameter documentation for more details.
+We recommend that you explicitly select a random number generator seed
+before invoking this function, or, alternatively, that you
+explicitly provide a single positive integer argument as \code{rngseed}.
+}
+\details{
+This approach is sometimes mistakenly called ``rarefaction'', which
+\href{http://en.wikipedia.org/wiki/Rarefaction}{in physics refers to a form of wave decompression;}
+but in this context, ecology, the term refers to a
+\href{http://en.wikipedia.org/wiki/Rarefaction_(ecology)}{repeated sampling procedure to assess species richness},
+first proposed in 1968 by Howard Sanders.
+In contrast, the procedure implemented here is used as an \emph{ad hoc} means to
+normalize microbiome counts that have
+resulted from libraries of widely-differing sizes.
+Here we have intentionally adopted an alternative
+name, \code{rarefy}, that has also been used recently
+to describe this process
+and, to our knowledge, not previously used in ecology.
+
+Make sure to use \code{\link{set.seed}} for exactly-reproducible results
+of the random subsampling.
+}
+\examples{
+# Test with esophagus dataset
+data("esophagus")
+esorepT = rarefy_even_depth(esophagus, replace=TRUE)
+esorepF = rarefy_even_depth(esophagus, replace=FALSE)
+sample_sums(esophagus)
+sample_sums(esorepT)
+sample_sums(esorepF)
+## Run Manually: Too slow!
+# data("GlobalPatterns")
+# GPrepT = rarefy_even_depth(GlobalPatterns, 1E5, replace=TRUE)
+## Actually just this one is slow
+# system.time(GPrepF <- rarefy_even_depth(GlobalPatterns, 1E5, replace=FALSE))
+}
+\seealso{
+\code{\link{sample}}
+
+\code{\link{set.seed}}
+}
+
diff --git a/man/read_tree.Rd b/man/read_tree.Rd
new file mode 100644
index 0000000..ca99829
--- /dev/null
+++ b/man/read_tree.Rd
@@ -0,0 +1,51 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/IO-methods.R
+\name{read_tree}
+\alias{read_tree}
+\title{Somewhat flexible tree-import function}
+\usage{
+read_tree(treefile, errorIfNULL=FALSE, ...)
+}
+\arguments{
+\item{treefile}{(Required). A character string implying a file \code{\link{connection}}
+(like a path or URL), or an actual \code{\link{connection}}.
+Must be a Newick- or Nexus-formatted tree.}
+
+\item{errorIfNULL}{(Optional). Logical. Should an error be thrown if no tree
+can be extracted from the connection?
+Default is \code{FALSE}, indicating that \code{NULL} will be
+SILENTLY returned, rather than an error.
+Be cautious about this behavior. Useful for phyloseq internals, but might
+be hard to track in your own code if you're not aware of this
+``no error by default'' setting. If this is a problem, change this value
+to \code{TRUE}, and you can still use the function.}
+
+\item{...}{(Optional). Additional parameter(s) passed to the
+relevant tree-importing function.}
+}
+\value{
+If successful, returns a \code{\link{phylo}}-class object as defined
+ in the \code{\link[ape]{ape-package}}. Returns NULL if neither tree-reading function worked.
+}
+\description{
+This function is a convenience wrapper around the
+\code{\link[ape]{read.tree}} (Newick-format) and
+\code{\link[ape]{read.nexus}} (Nexus-format) importers provided by
+the \code{\link[ape]{ape-package}}. This function attempts to return a valid
+tree if possible using either format importer. If it fails, it silently
+returns \code{NULL} by default, rather than throwing a show-stopping error.
+}
+\examples{
+read_tree(system.file("extdata", "esophagus.tree.gz", package="phyloseq"))
+read_tree(system.file("extdata", "GP_tree_rand_short.newick.gz", package="phyloseq"))
+}
+\seealso{
+\code{\link{read_tree_greengenes}}
+
+\code{\link{phylo}}
+
+\code{\link[ape]{read.tree}}
+
+\code{\link[ape]{read.nexus}}
+}
+
diff --git a/man/read_tree_greengenes.Rd b/man/read_tree_greengenes.Rd
new file mode 100644
index 0000000..b55f3bf
--- /dev/null
+++ b/man/read_tree_greengenes.Rd
@@ -0,0 +1,57 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/IO-methods.R
+\name{read_tree_greengenes}
+\alias{read_tree_greengenes}
+\title{Read GreenGenes tree released in annotated newick format}
+\usage{
+read_tree_greengenes(treefile)
+}
+\arguments{
+\item{treefile}{(Required). A character string implying
+a file \code{\link{connection}}
+(like a path or URL), or an actual \code{\link{connection}}.
+Must be a Newick-formatted tree released by GreenGenes
+in October 2012 or later.
+The similarity threshold of the OTUs should not matter,
+except that it should match your OTU table.}
+}
+\value{
+A tree, represented as a \code{\link{phylo}} object.
+}
+\description{
+In principle, this is a standard newick format, that can be imported
+into R using \code{\link{read_tree}},
+which in-turn utilizes \code{\link[ape]{read.tree}}.
+However, \code{\link[ape]{read.tree}} has failed to import
+recent (October 2012 and later) releases of the GreenGenes tree,
+and this problem has been traced to the additional annotations
+added to some internal nodes
+that specify taxonomic classification between single-quotes.
+To solve this problem and create a clear container
+for fixing future problems with the format of GreenGenes-released trees,
+this function is available in phyloseq and exported for users.
+It is also referenced in the documentation of the import functions
+for QIIME legacy and BIOM format importers --
+\code{\link{import_qiime}} and \code{\link{import_biom}}, respectively.
+However, since the precise format of the tree is not restricted to GreenGenes trees
+by QIIME or for the biom-format, this function is not called
+automatically by those aforementioned import functions.
+If your tree is formatted like, or is one of, the official GreenGenes
+release trees, then you should use this function and provide its output
+to your relevant import function.
+}
+\examples{
+# Read the May 2013, 73\% similarity official tree,
+# included as extra data in phyloseq.
+treefile = system.file("extdata", "gg13-5-73.tree.gz", package="phyloseq")
+x = read_tree_greengenes(treefile)
+x
+class(x)
+y = read_tree(treefile)
+y
+class(y)
+## Not run, causes an error:
+# library("ape")
+# read.tree(treefile)
+}
+
diff --git a/man/reconcile_categories.Rd b/man/reconcile_categories.Rd
new file mode 100644
index 0000000..f041b3d
--- /dev/null
+++ b/man/reconcile_categories.Rd
@@ -0,0 +1,35 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/sampleData-class.R
+\name{reconcile_categories}
+\alias{reconcile_categories}
+\title{Cleans absent levels in sample_data/data.frame.}
+\usage{
+reconcile_categories(DFSM)
+}
+\arguments{
+\item{DFSM}{(Required). A \code{data.frame} or \code{sample_data} object that needs to be cleaned.}
+}
+\value{
+A single \code{data.frame} object. Even if the input argument is a \code{sample_data},
+ the return is a \code{data.frame}. Because this is intended to be used internally by
+ the builder method, it cannot also call the builder function to re-build
+ the cleaned \code{sample_data}.
+}
+\description{
+This is used internally by the builder method, \code{\link{sample_data}}, to
+ensure that the factors describing categorical variables in a data.frame or
+sample_data object are free of extra levels that can plague downstream plots
+and analysis.
+}
+\examples{
+# # # data(GlobalPatterns)
+# # # SM <- sample_data(GlobalPatterns)
+# # # DF <- data.frame(SM)
+# # # DF <- data.frame(DF, col1=1:nrow(DF), col2=paste(1:nrow(DF), "t", sep=""))
+# # # DF <- reconcile_categories(DF)
+# # # SM <- sample_data(reconcile_categories(SM))
+# # # sapply(DF, class)
+# # # sapply(SM, class)
+}
+\keyword{internal}
+
diff --git a/man/refseq-methods.Rd b/man/refseq-methods.Rd
new file mode 100644
index 0000000..bd395d0
--- /dev/null
+++ b/man/refseq-methods.Rd
@@ -0,0 +1,49 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/almostAllAccessors.R
+\docType{methods}
+\name{refseq}
+\alias{refseq}
+\alias{refseq,ANY-method}
+\alias{refseq,XStringSet-method}
+\title{Retrieve reference sequences (\code{\link[Biostrings]{XStringSet}}-class) from object.}
+\usage{
+refseq(physeq, errorIfNULL=TRUE)
+
+\S4method{refseq}{ANY}(physeq, errorIfNULL = TRUE)
+
+\S4method{refseq}{XStringSet}(physeq)
+}
+\arguments{
+\item{physeq}{(Required). An instance of phyloseq-class
+that contains reference sequences. If physeq is a set of reference
+sequences (a component data class), then it is returned as-is.}
+
+\item{errorIfNULL}{(Optional). Logical. Should the accessor stop with
+an error if the slot is empty (\code{NULL})? Default \code{TRUE}.}
+}
+\value{
+The \code{\link[Biostrings]{XStringSet}}-class object contained within \code{physeq};
+ or NULL if \code{physeq} does not have reference sequences.
+ This method stops with an error in the latter NULL case by default, which
+ can be over-ridden by changing the value of \code{errorIfNULL} to \code{FALSE}.
+}
+\description{
+This is the suggested method
+for accessing
+the reference sequences (\code{\link[Biostrings]{XStringSet}}-class)
+from a phyloseq data object (\code{\link{phyloseq-class}}).
+Like other accessors (see See Also, below), the default behavior of this method
+is to stop with an
+error if \code{physeq} is a \code{phyloseq-class} but does not
+contain reference sequences (the component data type you are trying to access in this case).
+}
+\examples{
+ data(GlobalPatterns)
+ refseq(GlobalPatterns, FALSE)
+}
+\seealso{
+\code{\link{otu_table}}, \code{\link{sample_data}}, \code{\link{tax_table}}
+ \code{\link{phy_tree}},
+ \code{\link{phyloseq}}, \code{\link{merge_phyloseq}}
+}
+
diff --git a/man/rm_outlierf.Rd b/man/rm_outlierf.Rd
new file mode 100644
index 0000000..5915e7c
--- /dev/null
+++ b/man/rm_outlierf.Rd
@@ -0,0 +1,40 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/transform_filter-methods.R
+\name{rm_outlierf}
+\alias{rm_outlierf}
+\title{Set to FALSE any outlier species greater than f fractional abundance.}
+\usage{
+rm_outlierf(f, na.rm=TRUE)
+}
+\arguments{
+\item{f}{Single numeric value between 0 and 1. The maximum fractional abundance
+value that a taxa will be allowed to have in a sample without being marked
+for trimming.}
+
+\item{na.rm}{Logical. Should we remove NA values. Default \code{TRUE}.}
+}
+\value{
+A function (enclosure), suitable for \code{\link{filterfun_sample}}.
+}
+\description{
+This is for removing overly-abundant outlier taxa, not for trimming low-abundance
+taxa.
+}
+\examples{
+t1 <- 1:10; names(t1)<-paste("t", 1:10, sep="")
+rm_outlierf(0.15)(t1)
+## Use simulated abundance matrix
+set.seed(711)
+testOTU <- otu_table(matrix(sample(1:50, 25, replace=TRUE), 5, 5), taxa_are_rows=FALSE)
+taxa_sums(testOTU)
+f1 <- filterfun_sample(rm_outlierf(0.1))
+(wh1 <- genefilter_sample(testOTU, f1, A=1))
+wh2 <- c(TRUE, TRUE, TRUE, FALSE, FALSE)
+prune_taxa(wh1, testOTU)
+prune_taxa(wh2, testOTU)
+}
+\seealso{
+\code{\link{topk}}, \code{\link{topf}},
+ \code{\link{topp}}, \code{\link{rm_outlierf}}
+}
+
diff --git a/man/sample_data-class.Rd b/man/sample_data-class.Rd
new file mode 100644
index 0000000..4372902
--- /dev/null
+++ b/man/sample_data-class.Rd
@@ -0,0 +1,25 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/allClasses.R
+\docType{class}
+\name{sample_data-class}
+\alias{sample_data-class}
+\title{The S4 class for storing sample variables.}
+\description{
+Row indices represent samples, while column indices represent experimental
+categories, variables (and so forth) that describe the samples.
+}
+\details{
+\describe{
+
+\item{.Data}{data-frame data, inherited from the data.frame class.}
+
+\item{row.names}{
+ Also inherited from the data.frame class;
+ it should contain the sample names.
+ }
+
+\item{names}{Inherited from the data.frame class.}
+
+}
+}
+
diff --git a/man/sample_data-methods.Rd b/man/sample_data-methods.Rd
new file mode 100644
index 0000000..60396ae
--- /dev/null
+++ b/man/sample_data-methods.Rd
@@ -0,0 +1,53 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/sampleData-class.R
+\docType{methods}
+\name{sample_data}
+\alias{sample_data}
+\alias{sample_data,ANY-method}
+\alias{sample_data,data.frame-method}
+\title{Build or access sample_data.}
+\usage{
+sample_data(object, errorIfNULL=TRUE)
+
+\S4method{sample_data}{ANY}(object, errorIfNULL = TRUE)
+
+\S4method{sample_data}{data.frame}(object)
+}
+\arguments{
+\item{object}{(Required). A \code{\link{data.frame-class}},
+or a \code{\link{phyloseq-class}} object.}
+
+\item{errorIfNULL}{(Optional). Logical. Should the accessor stop with
+an error if the slot is empty (\code{NULL})? Default \code{TRUE}.}
+}
+\value{
+A \code{\link{sample_data-class}} object
+representing the sample variates of an experiment.
+}
+\description{
+This is the suggested method for both constructing and accessing a table
+of sample-level variables (\code{\link{sample_data-class}}),
+which in the \code{\link{phyloseq-package}} is represented as a special
+extension of the \code{\link{data.frame-class}}.
+When the
+argument is a \code{\link{data.frame}}, \code{sample_data} will create
+a sample_data-class object.
+In this case, the rows should be named to match the
+\code{\link{sample_names}} of the other objects to which it will ultimately be paired.
+Alternatively, if the first argument is an experiment-level (\code{\link{phyloseq-class}})
+object, then the corresponding \code{sample_data} is returned.
+Like other accessors (see See Also, below), the default behavior of this method
+is to stop with an
+error if \code{object} is a \code{phyloseq-class} but does not
+contain a \code{sample_data}.
+}
+\examples{
+#
+data(soilrep)
+head(sample_data(soilrep))
+}
+\seealso{
+\code{\link{phy_tree}}, \code{\link{tax_table}}, \code{\link{otu_table}}
+ \code{\link{phyloseq}}, \code{\link{merge_phyloseq}}
+}
+
diff --git a/man/sample_names-methods.Rd b/man/sample_names-methods.Rd
new file mode 100644
index 0000000..4c977b9
--- /dev/null
+++ b/man/sample_names-methods.Rd
@@ -0,0 +1,39 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/almostAllAccessors.R
+\docType{methods}
+\name{sample_names}
+\alias{sample_names}
+\alias{sample_names,ANY-method}
+\alias{sample_names,otu_table-method}
+\alias{sample_names,phyloseq-method}
+\alias{sample_names,sample_data-method}
+\title{Get sample names.}
+\usage{
+sample_names(physeq)
+
+\S4method{sample_names}{ANY}(physeq)
+
+\S4method{sample_names}{phyloseq}(physeq)
+
+\S4method{sample_names}{sample_data}(physeq)
+
+\S4method{sample_names}{otu_table}(physeq)
+}
+\arguments{
+\item{physeq}{(Required). A \code{\link{phyloseq-class}}, \code{\link{sample_data}},
+or \code{\link{otu_table-class}}.}
+}
+\value{
+A character vector. The names of the samples in \code{physeq}.
+}
+\description{
+Get sample names.
+}
+\examples{
+data(esophagus)
+sample_names(esophagus)
+}
+\seealso{
+\code{\link{taxa_names}}, \code{\link{nsamples}}
+}
+
diff --git a/man/sample_sums.Rd b/man/sample_sums.Rd
new file mode 100644
index 0000000..ce2d00a
--- /dev/null
+++ b/man/sample_sums.Rd
@@ -0,0 +1,31 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/otuTable-class.R
+\name{sample_sums}
+\alias{sample_sums}
+\title{Returns the total number of individuals observed from each sample.}
+\usage{
+sample_sums(x)
+}
+\arguments{
+\item{x}{\code{\link{otu_table-class}}, or \code{\link{phyloseq-class}}.}
+}
+\value{
+A named \code{\link{numeric-class}} with
+ length equal to the number of samples
+ in \code{x}, name indicating the sample ID, and value equal to the sum of
+ all individuals observed for each sample in \code{x}.
+}
+\description{
+A convenience function equivalent to rowSums or colSums, but where
+the orientation of the otu_table is automatically handled.
+}
+\examples{
+data(enterotype)
+sample_sums(enterotype)
+data(esophagus)
+sample_sums(esophagus)
+}
+\seealso{
+\code{\link{taxa_sums}}, \code{\link{rowSums}}, \code{\link{colSums}}
+}
+
diff --git a/man/sample_variables.Rd b/man/sample_variables.Rd
new file mode 100644
index 0000000..6f6eecd
--- /dev/null
+++ b/man/sample_variables.Rd
@@ -0,0 +1,33 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/almostAllAccessors.R
+\name{sample_variables}
+\alias{sample_variables}
+\title{Get the sample variables present in sample_data}
+\usage{
+sample_variables(physeq, errorIfNULL=TRUE)
+}
+\arguments{
+\item{physeq}{(Required). \code{\link{sample_data-class}}, or \code{\link{phyloseq-class}}.}
+
+\item{errorIfNULL}{(Optional). Logical. Should the accessor stop with
+an error if the slot is empty (\code{NULL})? Default \code{TRUE}.}
+}
+\value{
+Character vector. The names of the variables in the sample_data
+ data.frame. Essentially the column names. Useful for selecting model
+ and graphics parameters that interact with sample_data.
+}
+\description{
+This is a simple accessor function to make it more convenient to determine
+the sample variable names of a particular \code{\link{phyloseq-class}} object.
+}
+\examples{
+data(enterotype)
+sample_variables(enterotype)
+}
+\seealso{
+\code{\link{get_taxa}}
+ \code{\link{taxa_names}}
+ \code{\link{sample_names}}
+}
+
diff --git a/man/show-methods.Rd b/man/show-methods.Rd
new file mode 100644
index 0000000..dbc0b10
--- /dev/null
+++ b/man/show-methods.Rd
@@ -0,0 +1,34 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/show-methods.R
+\docType{methods}
+\name{show,otu_table-method}
+\alias{show,otu_table-method}
+\alias{show,phyloseq-method}
+\alias{show,sample_data-method}
+\alias{show,taxonomyTable-method}
+\title{method extensions to show for phyloseq objects.}
+\usage{
+\S4method{show}{otu_table}(object)
+
+\S4method{show}{sample_data}(object)
+
+\S4method{show}{taxonomyTable}(object)
+
+\S4method{show}{phyloseq}(object)
+}
+\arguments{
+\item{object}{Any R object}
+}
+\description{
+See the general documentation of \code{\link[methods]{show}} method for
+expected behavior.
+}
+\examples{
+# data(GlobalPatterns)
+# show(GlobalPatterns)
+# GlobalPatterns
+}
+\seealso{
+\code{\link[methods]{show}}
+}
+
diff --git a/man/show_mothur_cutoffs.Rd b/man/show_mothur_cutoffs.Rd
new file mode 100644
index 0000000..973c873
--- /dev/null
+++ b/man/show_mothur_cutoffs.Rd
@@ -0,0 +1,29 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/IO-methods.R
+\name{show_mothur_cutoffs}
+\alias{show_mothur_cutoffs}
+\title{Show cutoff values available in a mothur file.}
+\usage{
+show_mothur_cutoffs(mothur_list_file)
+}
+\arguments{
+\item{mothur_list_file}{The file name and/or location as produced by \emph{mothur}.}
+}
+\value{
+A character vector of the different cutoff values contained in the file.
+ For a given set of arguments to the \code{cluster()} command from within
+ \emph{mothur}, a number of OTU-clustering results are returned in the same
+ file. The exact cutoff values used by \emph{mothur} can vary depending
+ on the input data/parameters. This simple function returns the cutoffs that were actually
+ included in the \emph{mothur} output. This is an important extra step prior to
+ importing data with the \code{\link{import_mothur}} function.
+}
+\description{
+This is a helper function to report back to the user the different cutoff
+values available in a given mothur file --
+for instance, a list or shared file.
+}
+\seealso{
+\code{\link{import_mothur}}
+}
+
diff --git a/man/splat.phyloseq.objects.Rd b/man/splat.phyloseq.objects.Rd
new file mode 100644
index 0000000..c9153ba
--- /dev/null
+++ b/man/splat.phyloseq.objects.Rd
@@ -0,0 +1,36 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/phyloseq-class.R
+\name{splat.phyloseq.objects}
+\alias{splat.phyloseq.objects}
+\title{Convert \code{\link{phyloseq-class}} into a named list of its non-empty components.}
+\usage{
+splat.phyloseq.objects(x)
+}
+\arguments{
+\item{x}{A \code{\link{phyloseq-class}} object. Alternatively, a component
+data object will work, resulting in a named list of length 1.}
+}
+\value{
+A named list, where each element is a component object that was contained
+in the argument, \code{x}. Each element is named according to its slot-name in
+the phyloseq-object from which it is derived.
+If \code{x} is already a component data object,
+then a list of length (1) is returned, also named.
+}
+\description{
+This is used in internal handling functions, and one of its key features
+is that the names in the returned-list match the slot-names, which is useful
+for constructing calls with language-computing functions like \code{\link{do.call}}.
+Another useful aspect is that it only returns the contents of non-empty slots.
+In general, this should only be used by phyloseq-package developers. Standard
+users should not need or use this function, and should use the accessors and
+other tools that leave the multi-component object in one piece.
+}
+\examples{
+#
+}
+\seealso{
+merge_phyloseq
+}
+\keyword{internal}
+
diff --git a/man/subset_ord_plot.Rd b/man/subset_ord_plot.Rd
new file mode 100644
index 0000000..318cfca
--- /dev/null
+++ b/man/subset_ord_plot.Rd
@@ -0,0 +1,70 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/plot-methods.R
+\name{subset_ord_plot}
+\alias{subset_ord_plot}
+\title{Subset points from an ordination-derived ggplot}
+\usage{
+subset_ord_plot(p, threshold=0.05, method="farthest")
+}
+\arguments{
+\item{p}{(Required). A \code{\link{ggplot}} object created by
+\code{\link{plot_ordination}}. It contains the complete data that you
+want to subset.}
+
+\item{threshold}{(Optional). A numeric scalar. Default is \code{0.05}.
+This value determines a coordinate threshold or population threshold,
+depending on the value of the \code{method} argument, ultimately
+determining which points are included in returned \code{data.frame}.}
+
+\item{method}{(Optional). A character string. One of
+ \code{c("farthest", "radial", "square")}. Default is \code{"farthest"}.
+ This determines how threshold will be interpreted.
+
+\describe{
+
+ \item{farthest}{
+ Unlike the other two options, this option implies removing a
+ certain fraction or number of points from the plot, depending
+ on the value of \code{threshold}. If \code{threshold} is greater
+ than or equal to \code{1}, then all but \code{threshold} number
+ of points farthest from the origin are removed. Otherwise, if
+ \code{threshold} is less than \code{1}, only the \code{threshold}
+ fraction of points farthest from the origin are retained.
+ }
+
+ \item{radial}{
+ Keep only those points that are beyond \code{threshold}
+ radial distance from the origin. Has the effect of removing a
+ circle of points from the plot, centered at the origin.
+ }
+
+ \item{square}{
+ Keep only those points with at least one coordinate
+ greater than \code{threshold}. Has the effect of removing a
+ ``square'' of points from the plot, centered at the origin.
+ }
+
+ }}
+}
+\value{
+A \code{\link{data.frame}} suitable for creating a
+ \code{\link{ggplot}} plot object, graphically summarizing
+ the ordination result according to previously-specified parameters.
+}
+\description{
+Easily retrieve a plot-derived \code{data.frame} with a subset of points
+according to a threshold and method. The meaning of the threshold depends
+upon the method. See argument description below.
+There are many useful examples of phyloseq ordination graphics in the
+\href{http://joey711.github.io/phyloseq/subset_ord_plot-examples}{phyloseq online tutorials}.
+}
+\examples{
+## See the online tutorials.
+## http://joey711.github.io/phyloseq/subset_ord_plot-examples
+}
+\seealso{
+\href{http://joey711.github.io/phyloseq/subset_ord_plot-examples}{phyloseq online tutorial} for this function.
+
+ \code{\link{plot_ordination}}
+}
+
diff --git a/man/subset_samples-methods.Rd b/man/subset_samples-methods.Rd
new file mode 100644
index 0000000..d8a6170
--- /dev/null
+++ b/man/subset_samples-methods.Rd
@@ -0,0 +1,43 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/sampleData-class.R
+\docType{methods}
+\name{subset_samples}
+\alias{subset_samples}
+\title{Subset samples by sample_data expression}
+\usage{
+subset_samples(physeq, ...)
+}
+\arguments{
+\item{physeq}{A \code{\link{sample_data-class}}, or a \code{\link{phyloseq-class}}
+object with a
+\code{sample_data}. If the \code{sample_data} slot is missing in \code{physeq},
+then \code{physeq} will be returned as-is, and a warning will be printed to screen.}
+
+\item{...}{The subsetting expression that should be applied to the
+\code{sample_data}. This is passed on to \code{\link{subset}}, see its
+documentation for more details.}
+}
+\value{
+A subsetted object with the same class as \code{physeq}.
+}
+\description{
+This is a convenience wrapper around the \code{\link{subset}} function.
+It is intended to allow subsetting complex experimental objects with one
+function call.
+Subsetting is based on an expression for which the context first includes
+the variables contained in \code{\link{sample_data}}.
+The \code{samples} retained in the dataset are equivalent to
+\code{x[subset & !is.na(subset)]}, where \code{x} is the vector of sample IDs
+and \code{subset} is the logical that results from your subsetting expression.
+This is important to keep in mind, as users are often unaware that this
+subsetting step also removes/omits samples that have a missing value, \code{NA},
+somewhere in the expression.
+}
+\examples{
+ # data(GlobalPatterns)
+ # subset_samples(GlobalPatterns, SampleType=="Ocean")
+}
+\seealso{
+\code{\link{subset_taxa}}
+}
+
diff --git a/man/subset_taxa-methods.Rd b/man/subset_taxa-methods.Rd
new file mode 100644
index 0000000..1e81f8b
--- /dev/null
+++ b/man/subset_taxa-methods.Rd
@@ -0,0 +1,41 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/taxonomyTable-class.R
+\docType{methods}
+\name{subset_taxa}
+\alias{subset_taxa}
+\title{Subset species by taxonomic expression}
+\usage{
+subset_taxa(physeq, ...)
+}
+\arguments{
+\item{physeq}{A \code{\link{taxonomyTable-class}}, or \code{\link{phyloseq-class}} that contains a
+taxonomyTable. If the \code{tax_table} slot is missing in \code{physeq}, then \code{physeq}
+will be returned as-is and a warning will be printed to screen.}
+
+\item{...}{The subsetting expression that should be applied to the
+\code{taxonomyTable}. This is passed on to \code{\link{subset}}, and more
+details and examples about how it functions can be found in its documentation.}
+}
+\value{
+A subsetted object with the same class as \code{physeq}.
+}
+\description{
+This is a convenience wrapper around the \code{\link{subset}} function.
+It is intended to speed subsetting complex experimental objects with one
+function call. In the case of \code{subset_taxa}, the subsetting will be
+based on an expression related to the columns and values within the
+\code{tax_table} (\code{taxonomyTable} component) slot of \code{physeq}.
+The \code{OTUs} retained in the dataset are equivalent to
+\code{x[subset & !is.na(subset)]}, where \code{x} is the vector of OTU IDs
+and \code{subset} is the logical that results from your subsetting expression.
+This is important to keep in mind, as users are often unaware that this
+subsetting step also removes/omits OTUs that have a missing value result, \code{NA},
+somewhere in the expression.
+}
+\examples{
+## ex3 <- subset_taxa(GlobalPatterns, Phylum=="Bacteroidetes")
+}
+\seealso{
+\code{\link{subset_samples}}
+}
+
diff --git a/man/tax_glom.Rd b/man/tax_glom.Rd
new file mode 100644
index 0000000..92e9d98
--- /dev/null
+++ b/man/tax_glom.Rd
@@ -0,0 +1,73 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/transform_filter-methods.R
+\name{tax_glom}
+\alias{tax_glom}
+\title{Agglomerate taxa of the same type.}
+\usage{
+tax_glom(physeq, taxrank=rank_names(physeq)[1], NArm=TRUE, bad_empty=c(NA, "", " ", "\t"))
+}
+\arguments{
+\item{physeq}{(Required). \code{\link{phyloseq-class}} or \code{\link{otu_table}}.}
+
+\item{taxrank}{A character string specifying the taxonomic level
+that you want to agglomerate over.
+Should be among the results of \code{rank_names(physeq)}.
+The default value is \code{rank_names(physeq)[1]},
+which may agglomerate too broadly for a given experiment.
+You are strongly encouraged to try different values for this argument.}
+
+\item{NArm}{(Optional). Logical, length equal to one. Default is \code{TRUE}.
+CAUTION. The decision to prune (or not) taxa for which you lack categorical
+data could have a large effect on downstream analysis. You may want to
+re-compute your analysis under both conditions, or at least think carefully
+about what the effect might be and the reasons explaining the absence of
+information for certain taxa. In the case of taxonomy, it is often a result
+of imprecision in taxonomic designation based on short phylogenetic sequences
+and a patchy system of nomenclature. If this seems to be an issue for your
+analysis, think about also trying the nomenclature-agnostic \code{\link{tip_glom}}
+method if you have a phylogenetic tree available.}
+
+\item{bad_empty}{(Optional). Character vector. Default: \code{c(NA, "", " ", "\t")}.
+Defines the bad/empty values
+that should be ignored and/or considered unknown. They will be removed
+from the internal agglomeration vector derived from the argument to \code{tax},
+and therefore agglomeration will not combine taxa according to the presence
+of these values in \code{tax}. Furthermore, the corresponding taxa can be
+optionally pruned from the output if \code{NArm} is set to \code{TRUE}.}
+}
+\value{
+A taxonomically-agglomerated, optionally-pruned, object with class matching
+the class of \code{physeq}.
+}
+\description{
+This method merges species that have the same taxonomy at a certain
+taxonomic rank.
+Its approach is analogous to \code{\link{tip_glom}}, but uses categorical data
+instead of a tree. In principle, other categorical data known for all taxa
+could also be used in place of taxonomy,
+but for the moment, this must be stored in the \code{taxonomyTable}
+of the data. Also, columns/ranks to the right of the rank chosen to use
+for agglomeration will be replaced with \code{NA},
+because they should be meaningless following agglomeration.
+}
+\examples{
+# data(GlobalPatterns)
+# ## print the available taxonomic ranks
+# colnames(tax_table(GlobalPatterns))
+# ## agglomerate at the Family taxonomic rank
+# (x1 <- tax_glom(GlobalPatterns, taxrank="Family") )
+# ## How many taxa before/after agglomeration?
+# ntaxa(GlobalPatterns); ntaxa(x1)
+# ## Look at enterotype dataset...
+# data(enterotype)
+# ## print the available taxonomic ranks. Shows only 1 rank available, not useful for tax_glom
+# colnames(tax_table(enterotype))
+}
+\seealso{
+\code{\link{tip_glom}}
+
+\code{\link{prune_taxa}}
+
+\code{\link{merge_taxa}}
+}
+
diff --git a/man/tax_table-methods.Rd b/man/tax_table-methods.Rd
new file mode 100644
index 0000000..b549b98
--- /dev/null
+++ b/man/tax_table-methods.Rd
@@ -0,0 +1,57 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/taxonomyTable-class.R
+\docType{methods}
+\name{tax_table}
+\alias{tax_table}
+\alias{tax_table,ANY-method}
+\alias{tax_table,data.frame-method}
+\alias{tax_table,matrix-method}
+\title{Build or access the taxonomyTable.}
+\usage{
+tax_table(object, errorIfNULL=TRUE)
+
+\S4method{tax_table}{ANY}(object, errorIfNULL = TRUE)
+
+\S4method{tax_table}{matrix}(object)
+
+\S4method{tax_table}{data.frame}(object)
+}
+\arguments{
+\item{object}{An object among the set of classes defined by the phyloseq
+package that contain taxonomyTable.}
+
+\item{errorIfNULL}{(Optional). Logical. Should the accessor stop with
+an error if the slot is empty (\code{NULL})? Default \code{TRUE}.}
+}
+\value{
+A \code{\link{taxonomyTable-class}} object.
+It is either grabbed from the relevant slot
+if \code{object} is complex, or built anew if \code{object} is a
+character matrix representing the taxonomic classification of
+species in the experiment.
+}
+\description{
+This is the suggested method for both constructing and accessing a table of
+taxonomic names, organized with ranks as columns (\code{\link{taxonomyTable-class}}).
+When the argument is a character matrix, tax_table() will create and return a
+\code{\link{taxonomyTable-class}} object.
+In this case, the rows should be named to match the
+\code{\link{taxa_names}} of the other objects to which it will ultimately be paired.
+Alternatively, if the first argument is an experiment-level (\code{\link{phyloseq-class}})
+object, then the corresponding \code{taxonomyTable} is returned.
+Like other accessors (see See Also, below), the default behavior of this method
+is to stop with an
+error if \code{object} is a \code{phyloseq-class} but does not
+contain a \code{taxonomyTable}.
+}
+\examples{
+#
+# tax1 <- tax_table(matrix("abc", 30, 8))
+# data(GlobalPatterns)
+# tax_table(GlobalPatterns)
+}
+\seealso{
+\code{\link{phy_tree}}, \code{\link{sample_data}}, \code{\link{otu_table}}
+ \code{\link{phyloseq}}, \code{\link{merge_phyloseq}}
+}
+
diff --git a/man/taxa_are_rows-methods.Rd b/man/taxa_are_rows-methods.Rd
new file mode 100644
index 0000000..29966fa
--- /dev/null
+++ b/man/taxa_are_rows-methods.Rd
@@ -0,0 +1,31 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/almostAllAccessors.R
+\docType{methods}
+\name{taxa_are_rows}
+\alias{taxa_are_rows}
+\alias{taxa_are_rows,ANY-method}
+\alias{taxa_are_rows,otu_table-method}
+\alias{taxa_are_rows,phyloseq-method}
+\title{Access taxa_are_rows slot from otu_table objects.}
+\usage{
+taxa_are_rows(physeq)
+
+\S4method{taxa_are_rows}{ANY}(physeq)
+
+\S4method{taxa_are_rows}{otu_table}(physeq)
+
+\S4method{taxa_are_rows}{phyloseq}(physeq)
+}
+\arguments{
+\item{physeq}{(Required). \code{\link{phyloseq-class}}, or \code{\link{otu_table-class}}.}
+}
+\value{
+A logical indicating the orientation of the otu_table.
+}
+\description{
+Access taxa_are_rows slot from otu_table objects.
+}
+\seealso{
+\code{\link{otu_table}}
+}
+
diff --git a/man/taxa_names-methods.Rd b/man/taxa_names-methods.Rd
new file mode 100644
index 0000000..aa0d8bc
--- /dev/null
+++ b/man/taxa_names-methods.Rd
@@ -0,0 +1,55 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/almostAllAccessors.R
+\docType{methods}
+\name{taxa_names}
+\alias{taxa_names}
+\alias{taxa_names,ANY-method}
+\alias{taxa_names,XStringSet-method}
+\alias{taxa_names,otu_table-method}
+\alias{taxa_names,phylo-method}
+\alias{taxa_names,phyloseq-method}
+\alias{taxa_names,sample_data-method}
+\alias{taxa_names,taxonomyTable-method}
+\title{Get species / taxa names.}
+\usage{
+taxa_names(physeq)
+
+\S4method{taxa_names}{ANY}(physeq)
+
+\S4method{taxa_names}{phyloseq}(physeq)
+
+\S4method{taxa_names}{otu_table}(physeq)
+
+\S4method{taxa_names}{taxonomyTable}(physeq)
+
+\S4method{taxa_names}{sample_data}(physeq)
+
+\S4method{taxa_names}{phylo}(physeq)
+
+\S4method{taxa_names}{XStringSet}(physeq)
+}
+\arguments{
+\item{physeq}{\code{\link{phyloseq-class}}, \code{\link{otu_table-class}},
+\code{\link{taxonomyTable-class}}, or
+\code{\link[ape]{phylo}}}
+}
+\value{
+A character vector of the names of the species in \code{physeq}.
+}
+\description{
+Get species / taxa names.
+}
+\examples{
+#
+data("esophagus")
+tree <- phy_tree(esophagus)
+OTU1 <- otu_table(esophagus)
+taxa_names(tree)
+taxa_names(OTU1)
+physeq1 <- phyloseq(OTU1, tree)
+taxa_names(physeq1)
+}
+\seealso{
+ntaxa
+}
+
diff --git a/man/taxa_sums.Rd b/man/taxa_sums.Rd
new file mode 100644
index 0000000..cea644e
--- /dev/null
+++ b/man/taxa_sums.Rd
@@ -0,0 +1,30 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/otuTable-class.R
+\name{taxa_sums}
+\alias{taxa_sums}
+\title{Returns the total number of individuals observed from each species/taxa/OTU.}
+\usage{
+taxa_sums(x)
+}
+\arguments{
+\item{x}{\code{\link{otu_table-class}}, or \code{\link{phyloseq-class}}.}
+}
+\value{
+A \code{\link{numeric-class}} with length equal to the number of species
+ in the table, name indicating the taxa ID, and value equal to the sum of
+ all individuals observed for each taxon in \code{x}.
+}
+\description{
+A convenience function equivalent to rowSums or colSums, but where
+the orientation of the otu_table is automatically handled.
+}
+\examples{
+data(enterotype)
+taxa_sums(enterotype)
+data(esophagus)
+taxa_sums(esophagus)
+}
+\seealso{
+\code{\link{sample_sums}}, \code{\link{rowSums}}, \code{\link{colSums}}
+}
+
diff --git a/man/taxonomyTable-class.Rd b/man/taxonomyTable-class.Rd
new file mode 100644
index 0000000..9821901
--- /dev/null
+++ b/man/taxonomyTable-class.Rd
@@ -0,0 +1,16 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/allClasses.R
+\docType{class}
+\name{taxonomyTable-class}
+\alias{taxonomyTable-class}
+\title{An S4 class that holds taxonomic classification data as a character
+matrix.}
+\description{
+Row indices represent taxa, columns represent taxonomic classifiers.
+}
+\details{
+\describe{
+ \item{.Data}{This slot is inherited from the \code{\link{matrix}} class.}
+}
+}
+
diff --git a/man/threshrank.Rd b/man/threshrank.Rd
new file mode 100644
index 0000000..c5c4e4c
--- /dev/null
+++ b/man/threshrank.Rd
@@ -0,0 +1,43 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/transform_filter-methods.R
+\name{threshrank}
+\alias{threshrank}
+\title{Thresholded rank transformation.}
+\usage{
+threshrank(x, thresh, keep0s=FALSE, ...)
+}
+\arguments{
+\item{x}{(Required). Numeric vector to transform.}
+
+\item{thresh}{A single numeric value giving the threshold.}
+
+\item{keep0s}{A logical determining whether 0's in \code{x} should remain
+a zero-value in the output. If FALSE, zeros are treated as any other value.}
+
+\item{...}{Further arguments passed to the \code{\link{rank}} function.}
+}
+\value{
+A ranked, (optionally) thresholded numeric vector with length equal to
+ \code{x}. Default arguments to \code{rank} are used, unless provided as
+ additional arguments.
+}
+\description{
+The lowest \code{thresh} values in \code{x} all get the value 'thresh'.
+}
+\examples{
+#
+(a_vector <- sample(0:10, 100, TRUE))
+threshrank(a_vector, 5, keep0s=TRUE)
+data(GlobalPatterns)
+GP <- GlobalPatterns
+## These three approaches result in identical otu_table
+(x1 <- transform_sample_counts( otu_table(GP), threshrankfun(500)) )
+(x2 <- otu_table(apply(otu_table(GP), 2, threshrankfun(500)), taxa_are_rows(GP)) )
+identical(x1, x2)
+(x3 <- otu_table(apply(otu_table(GP), 2, threshrank, thresh=500), taxa_are_rows(GP)) )
+identical(x1, x3)
+}
+\seealso{
+\code{\link{transform_sample_counts}}, \code{\link{rank}}, \code{\link{threshrankfun}}
+}
+
diff --git a/man/threshrankfun.Rd b/man/threshrankfun.Rd
new file mode 100644
index 0000000..60abf7a
--- /dev/null
+++ b/man/threshrankfun.Rd
@@ -0,0 +1,38 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/transform_filter-methods.R
+\name{threshrankfun}
+\alias{threshrankfun}
+\title{A closure version of the \code{threshrank} function.}
+\usage{
+threshrankfun(thresh, keep0s=FALSE, ...)
+}
+\arguments{
+\item{thresh}{A single numeric value giving the threshold.}
+
+\item{keep0s}{A logical determining whether 0's in \code{x} should remain
+a zero-value in the output. If FALSE, zeros are treated as any other value.}
+
+\item{...}{Further arguments passed to the \code{\link{rank}} function.}
+}
+\value{
+A single-argument function with the options to \code{\link{threshrank}} set.
+}
+\description{
+Takes the same arguments as \code{\link{threshrank}}, except for \code{x},
+because the output is a single-argument function rather than a rank-transformed numeric.
+This is useful for higher-order functions that require a single-argument function as input,
+like \code{\link{transform_sample_counts}}.
+}
+\examples{
+data(esophagus)
+x1 = transform_sample_counts(esophagus, threshrankfun(50))
+otu_table(x1)
+x2 = transform_sample_counts(esophagus, rank)
+otu_table(x2)
+identical(x1, x2)
+}
+\seealso{
+\code{\link{transform_sample_counts}}, \code{\link{threshrankfun}},
+ \code{\link{threshrank}}
+}
+
diff --git a/man/tip_glom.Rd b/man/tip_glom.Rd
new file mode 100644
index 0000000..f11adf2
--- /dev/null
+++ b/man/tip_glom.Rd
@@ -0,0 +1,70 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/transform_filter-methods.R
+\name{tip_glom}
+\alias{tip_glom}
+\title{Agglomerate closely-related taxa using single-linkage clustering.}
+\usage{
+tip_glom(physeq, h = 0.2, hcfun = agnes, ...)
+}
+\arguments{
+\item{physeq}{(Required). A \code{\link{phyloseq-class}},
+containing a phylogenetic tree.
+Alternatively, a phylogenetic tree \code{\link[ape]{phylo}} will also work.}
+
+\item{h}{(Optional). Numeric scalar of the height where the tree should be cut.
+This refers to the tree resulting from hierarchical clustering
+of \code{\link[ape]{cophenetic.phylo}(phy_tree(physeq))},
+not necessarily the original phylogenetic tree, \code{phy_tree(physeq)}.
+Default value is \code{0.2}.
+Note that this argument used to be named \code{speciationMinLength},
+before this function/method was rewritten.}
+
+\item{hcfun}{(Optional). A function.
+The (agglomerative, hierarchical) clustering function to use.
+Good examples are
+\code{\link[cluster]{agnes}} and \code{\link[stats]{hclust}}.
+The default is \code{\link[cluster]{agnes}}.}
+
+\item{...}{(Optional). Additional named arguments to pass
+to \code{hcfun}.}
+}
+\value{
+An instance of the \code{\link{phyloseq-class}}.
+ Or alternatively, a \code{\link{phylo}} object if the
+ \code{physeq} argument was just a tree.
+ In the expected-use case, the number of OTUs will be fewer
+ (see \code{\link{ntaxa}}),
+ after merging OTUs that are related enough to be called
+ the same OTU.
+}
+\description{
+All tips of the tree separated by a cophenetic distance smaller than
+\code{h} will be agglomerated into one taxon using \code{\link{merge_taxa}}.
+}
+\details{
+Can be used to create a non-trivial OTU Table, if a phylogenetic tree is available.
+
+For now, a simple, ``greedy'', single-linkage clustering is used. In future releases
+it should be possible to specify different clustering approaches available in \code{R},
+in particular, complete-linkage clustering appears to be used more commonly for OTU
+clustering applications.
+}
+\examples{
+data("esophagus")
+# for speed
+esophagus = prune_taxa(taxa_names(esophagus)[1:25], esophagus)
+plot_tree(esophagus, label.tips="taxa_names", size="abundance", title="Before tip_glom()")
+plot_tree(tip_glom(esophagus, h=0.2), label.tips="taxa_names", size="abundance", title="After tip_glom()")
+}
+\seealso{
+\code{\link{merge_taxa}}
+
+\code{\link[cluster]{agnes}}
+
+\code{\link[stats]{hclust}}
+
+\code{\link[ape]{cophenetic.phylo}}
+
+\code{\link[ape]{phylo}}
+}
+
diff --git a/man/topf.Rd b/man/topf.Rd
new file mode 100644
index 0000000..6640290
--- /dev/null
+++ b/man/topf.Rd
@@ -0,0 +1,42 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/transform_filter-methods.R
+\name{topf}
+\alias{topf}
+\title{Make filter fun. that returns the top f fraction of taxa in a sample.}
+\usage{
+topf(f, na.rm=TRUE)
+}
+\arguments{
+\item{f}{Single numeric value between 0 and 1.}
+
+\item{na.rm}{Logical. Should we remove NA values. Default \code{TRUE}.}
+}
+\value{
+A function (enclosure), suitable for \code{\link{filterfun_sample}},
+ that will return \code{TRUE}
+ for each element in the taxa comprising the most abundant f fraction of individuals.
+}
+\description{
+As opposed to \code{\link{topp}}, which gives the
+most abundant p fraction of observed taxa (richness, instead of cumulative
+abundance). Said another way, topf ensures a certain
+fraction of the total sequences are retained, while topp ensures
+that a certain fraction of taxa/species/OTUs are retained.
+}
+\examples{
+t1 <- 1:10; names(t1)<-paste("t", 1:10, sep="")
+topf(0.6)(t1)
+## Use simulated abundance matrix
+set.seed(711)
+testOTU <- otu_table(matrix(sample(1:50, 25, replace=TRUE), 5, 5), taxa_are_rows=FALSE)
+f1 <- filterfun_sample(topf(0.4))
+(wh1 <- genefilter_sample(testOTU, f1, A=1))
+wh2 <- c(TRUE, TRUE, TRUE, FALSE, FALSE)
+prune_taxa(wh1, testOTU)
+prune_taxa(wh2, testOTU)
+}
+\seealso{
+\code{\link{topk}}, \code{\link{topf}},
+ \code{\link{topp}}, \code{\link{rm_outlierf}}
+}
+
diff --git a/man/topk.Rd b/man/topk.Rd
new file mode 100644
index 0000000..3868ad8
--- /dev/null
+++ b/man/topk.Rd
@@ -0,0 +1,36 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/transform_filter-methods.R
+\name{topk}
+\alias{topk}
+\title{Make filter fun. that returns the most abundant \code{k} taxa}
+\usage{
+topk(k, na.rm=TRUE)
+}
+\arguments{
+\item{k}{An integer, indicating how many of the most abundant taxa
+should be kept.}
+
+\item{na.rm}{A logical. Should \code{NA}s be removed. Default is \code{TRUE}.}
+}
+\value{
+Returns a function (enclosure) that will return TRUE
+ for each element in the most abundant k values.
+}
+\description{
+Make filter fun. that returns the most abundant \code{k} taxa
+}
+\examples{
+## Use simulated abundance matrix
+set.seed(711)
+testOTU <- otu_table(matrix(sample(1:50, 25, replace=TRUE), 5, 5), taxa_are_rows=FALSE)
+f1 <- filterfun_sample(topk(2))
+wh1 <- genefilter_sample(testOTU, f1, A=2)
+wh2 <- c(TRUE, TRUE, TRUE, FALSE, FALSE)
+prune_taxa(wh1, testOTU)
+prune_taxa(wh2, testOTU)
+}
+\seealso{
+\code{\link{topk}}, \code{\link{topf}},
+ \code{\link{topp}}, \code{\link{rm_outlierf}}
+}
+
diff --git a/man/topp.Rd b/man/topp.Rd
new file mode 100644
index 0000000..44b48e5
--- /dev/null
+++ b/man/topp.Rd
@@ -0,0 +1,38 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/transform_filter-methods.R
+\name{topp}
+\alias{topp}
+\title{Make filter fun. that returns the most abundant \code{p} fraction of taxa}
+\usage{
+topp(p, na.rm=TRUE)
+}
+\arguments{
+\item{p}{A numeric of length 1, indicating what fraction of the most abundant taxa
+should be kept.}
+
+\item{na.rm}{A logical. Should \code{NA}s be removed. Default is \code{TRUE}.}
+}
+\value{
+A function (enclosure), suitable for \code{\link{filterfun_sample}},
+ that will return \code{TRUE}
+ for each element in the most abundant p fraction of taxa.
+}
+\description{
+Make filter fun. that returns the most abundant \code{p} fraction of taxa
+}
+\examples{
+## Use simulated abundance matrix
+set.seed(711)
+testOTU <- otu_table(matrix(sample(1:50, 25, replace=TRUE), 5, 5), taxa_are_rows=FALSE)
+sample_sums(testOTU)
+f1 <- filterfun_sample(topp(0.2))
+(wh1 <- genefilter_sample(testOTU, f1, A=1))
+wh2 <- c(TRUE, TRUE, TRUE, FALSE, FALSE)
+prune_taxa(wh1, testOTU)
+prune_taxa(wh2, testOTU)
+}
+\seealso{
+\code{\link{topk}}, \code{\link{topf}},
+ \code{\link{topp}}, \code{\link{rm_outlierf}}
+}
+
diff --git a/man/transformcounts.Rd b/man/transformcounts.Rd
new file mode 100644
index 0000000..c526b21
--- /dev/null
+++ b/man/transformcounts.Rd
@@ -0,0 +1,55 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/transform_filter-methods.R
+\docType{methods}
+\name{transform_sample_counts}
+\alias{transformSampleCounts}
+\alias{transform_sample_counts}
+\title{Transform abundance data in an \code{otu_table}, sample-by-sample.}
+\usage{
+transform_sample_counts(physeq, fun, ...)
+
+transformSampleCounts(physeq, fun, ...)
+}
+\arguments{
+\item{physeq}{(Required). \code{\link{phyloseq-class}} or \code{\link{otu_table-class}}.}
+
+\item{fun}{(Required). A single-argument function that will be applied
+to the abundance counts of each sample.
+Can be an anonymous \code{\link[base]{function}}.}
+
+\item{...}{(Optional). Additional, optionally-named, arguments passed to
+\code{fun} during transformation of abundance data.}
+}
+\value{
+A transformed \code{otu_table} -- or \code{phyloseq} object with its
+ transformed \code{otu_table}.
+ In general, trimming is not expected by this
+ method, so it is suggested that the user provide only functions that return
+ a full-length vector. Filtering/trimming can follow, for which the
+ \code{\link{genefilter_sample}} and \code{\link{prune_taxa}} functions
+ are suggested.
+}
+\description{
+This function transforms the sample counts of a taxa
+abundance matrix according to a user-provided function.
+The counts of each sample will be transformed individually. No sample-sample
+interaction/comparison is possible by this method.
+}
+\examples{
+#
+data(esophagus)
+x1 = transform_sample_counts(esophagus, threshrankfun(50))
+head(otu_table(x1), 10)
+x2 = transform_sample_counts(esophagus, rank)
+head(otu_table(x2), 10)
+identical(x1, x2)
+x3 = otu_table(esophagus) + 5
+x3 = transform_sample_counts(x3, log)
+head(otu_table(x3), 10)
+x4 = transform_sample_counts(esophagus, function(x) round(x^2.2, 0))
+head(otu_table(x4), 10)
+}
+\seealso{
+\code{\link{threshrankfun}}, \code{\link{rank}}, \code{\link{log}}
+}
+
diff --git a/man/transpose-methods.Rd b/man/transpose-methods.Rd
new file mode 100644
index 0000000..a99a674
--- /dev/null
+++ b/man/transpose-methods.Rd
@@ -0,0 +1,32 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/transform_filter-methods.R
+\docType{methods}
+\name{t}
+\alias{t}
+\alias{t,otu_table-method}
+\alias{t,phyloseq-method}
+\title{Transpose \code{\link{otu_table-class}} or \code{\link{phyloseq-class}}}
+\usage{
+t(x)
+
+\S4method{t}{otu_table}(x)
+
+\S4method{t}{phyloseq}(x)
+}
+\arguments{
+\item{x}{An \code{otu_table} or \code{\link{phyloseq-class}}.}
+}
+\value{
+The class of the object returned by \code{t} matches
+the class of the argument, \code{x}. The \code{otu_table} is
+transposed, and \code{\link{taxa_are_rows}} value is toggled.
+}
+\description{
+Extends the base transpose method, \code{\link[base]{t}}.
+}
+\examples{
+data(GlobalPatterns)
+otu_table(GlobalPatterns)
+t( otu_table(GlobalPatterns) )
+}
+
diff --git a/man/tree_layout.Rd b/man/tree_layout.Rd
new file mode 100644
index 0000000..f1879ae
--- /dev/null
+++ b/man/tree_layout.Rd
@@ -0,0 +1,67 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/plot-methods.R
+\name{tree_layout}
+\alias{tree_layout}
+\title{Returns a data table defining the line segments of a phylogenetic tree.}
+\usage{
+tree_layout(phy, ladderize = FALSE)
+}
+\arguments{
+\item{phy}{(Required). The \code{\link{phylo}} or \code{\link{phyloseq-class}}
+object (which must contain a \code{\link{phylo}}genetic tree)
+that you want converted to \code{\link{data.table}}s
+suitable for plotting with \code{\link[ggplot2]{ggplot}}2.}
+
+\item{ladderize}{(Optional). Boolean or character string (either
+\code{FALSE}, \code{TRUE}, or \code{"left"}).
+Default is \code{FALSE} (no ladderization).
+This parameter specifies whether or not to \code{\link[ape]{ladderize}} the tree
+(i.e., reorder nodes according to the depth of their enclosed
+subtrees) prior to plotting.
+This tends to make trees more aesthetically pleasing and legible in
+a graphical display.
+When \code{TRUE} or \code{"right"}, ``right'' ladderization is used.
+When set to \code{FALSE}, no ladderization is applied.
+When set to \code{"left"}, the reverse direction
+(``left'' ladderization) is applied.}
+}
+\value{
+A list of two \code{\link{data.table}}s, containing respectively
+ a \code{data.table} of edge segment coordinates, named \code{edgeDT},
+ and a \code{data.table} of vertical connecting segments, named \code{vertDT}.
+ See \code{example} below for a simple demonstration.
+}
+\description{
+This function takes a \code{\link{phylo}} or \code{\link{phyloseq-class}} object
+and returns a list of two \code{\link{data.table}}s suitable for plotting
+a phylogenetic tree with \code{\link[ggplot2]{ggplot}}2.
+}
+\examples{
+library("ggplot2")
+data("esophagus")
+phy = phy_tree(esophagus)
+phy <- ape::root(phy, "65_2_5", resolve.root=TRUE)
+treeSegs0 = tree_layout(phy)
+treeSegs1 = tree_layout(esophagus)
+edgeMap = aes(x=xleft, xend=xright, y=y, yend=y)
+vertMap = aes(x=x, xend=x, y=vmin, yend=vmax)
+p0 = ggplot(treeSegs0$edgeDT, edgeMap) + geom_segment() + geom_segment(vertMap, data=treeSegs0$vertDT)
+p1 = ggplot(treeSegs1$edgeDT, edgeMap) + geom_segment() + geom_segment(vertMap, data=treeSegs1$vertDT)
+print(p0)
+print(p1)
+plot_tree(esophagus, "treeonly")
+plot_tree(esophagus, "treeonly", ladderize="left")
+}
+\seealso{
+An early example of this functionality was borrowed directly, with permission,
+from the package called \code{ggphylo},
+released on GitHub at:
+\url{https://github.com/gjuggler/ggphylo}
+by its author Gregory Jordan \email{gjuggler at gmail.com}.
+That original phyloseq internal function, \code{tree.layout}, has been
+completely replaced by this smaller and much faster user-accessible
+function that utilizes performance enhancements from standard
+\code{\link{data.table}} magic as well as \code{\link{ape-package}}
+internal C code.
+}
+
diff --git a/tests/testthat-phyloseq.R b/tests/testthat-phyloseq.R
new file mode 100644
index 0000000..3743c35
--- /dev/null
+++ b/tests/testthat-phyloseq.R
@@ -0,0 +1,13 @@
+library("testthat")
+packageVersion("phyloseq")
+# As suggested for opt-out option on testing by users, recommended by CRAN
+# http://adv-r.had.co.nz/Testing.html
+# Previously, best practice was to put all test files in inst/tests and ensure that R CMD check ran them by putting the following code in tests/test-all.R:
+# library(testthat)
+# library(yourpackage)
+# test_package("yourpackage")
+# Now, recommended practice is to put your tests in tests/testthat, and ensure R CMD check runs them by putting the following code in tests/test-all.R:
+# library(testthat)
+# test_check("yourpackage")
+# The advantage of this new structure is that the user has control over whether or not tests are installed using the --install-tests parameter to R CMD INSTALL, or the INSTALL_opts = c("--install-tests") argument to install.packages(). I'm not sure why you wouldn't want to install the tests, but now you have the flexibility as requested by CRAN maintainers.
+test_check("phyloseq")
diff --git a/tests/testthat/test-IO.R b/tests/testthat/test-IO.R
new file mode 100644
index 0000000..25be08e
--- /dev/null
+++ b/tests/testthat/test-IO.R
@@ -0,0 +1,379 @@
+################################################################################
+# Use testthat to test file import and resulting class (and values)
+################################################################################
+library("phyloseq"); library("testthat")
+# # # # TESTS!
+
+################################################################################
+# import_mothur tests
+mothlist <- system.file("extdata", "esophagus.fn.list.gz", package="phyloseq")
+mothgroup <- system.file("extdata", "esophagus.good.groups.gz", package="phyloseq")
+mothtree <- system.file("extdata", "esophagus.tree.gz", package="phyloseq")
+cutoff <- "0.10"
+esophman <- import_mothur(mothlist, mothgroup, mothtree, cutoff)
+# mothur "Shared" file, create with mothur from these example data files
+mothshared = system.file("extdata", "esophagus.fn.shared.gz", package="phyloseq")
+constaxonomy = system.file("extdata", "mothur_example.cons.taxonomy.gz", package="phyloseq")
+
+test_that("import_mothur: import of esophagus dataset from mothur files in extdata/ produces a phyloseq object", {
+ expect_that(esophman, is_a("phyloseq"))
+})
+
+test_that("import_mothur: The two phyloseq objects, example and just-imported, are identical", {
+ data("esophagus")
+ expect_that(esophagus, is_equivalent_to(esophman))
+})
+
+test_that("import_mothur: Test mothur file import on the (esophagus data).", {
+ smlc <- show_mothur_cutoffs(mothlist)
+ expect_that(smlc, is_equivalent_to(c("unique", "0.00", "0.01", "0.02", "0.03", "0.04", "0.05", "0.06", "0.07", "0.08", "0.09", "0.10")))
+})
+
+test_that("import_mothur: abundances can be manipulated mathematically", {
+ x1 <- as(otu_table(esophman), "matrix")
+ expect_that(2*x1-x1, is_equivalent_to(x1) )
+})
+
+test_that("import_mothur: empty stuff is NULL", {
+ expect_that(tax_table(esophman, FALSE), is_a("NULL"))
+ expect_that(sample_data(esophman, FALSE), is_a("NULL"))
+})
+
+test_that("import_mothur: Expected classes of non-empty components", {
+ expect_that(otu_table(esophman), is_a("otu_table"))
+ expect_that(phy_tree(esophman), is_a("phylo"))
+})
+
+test_that("import_mothur: imported files become S4 object", {
+ expect_that(isS4(esophman), is_true())
+})
+
+test_that("import_mothur: show method output tests", {
+ expect_output(print(esophman), "phyloseq-class experiment-level object")
+})
+
+test_that("Test newer supported mothur output files, constaxonomy and shared files", {
+ # Shared file
+ sharedOTU = import_mothur(mothur_shared_file=mothshared, cutoff=cutoff)
+ expect_is(sharedOTU, "otu_table")
+ expect_equivalent(as(sharedOTU[1:5, ], "matrix")[, "B"], c(50, 37, 14, 52, 2))
+ expect_equivalent(as(sharedOTU[1, ], "matrix")[1, ], c(50, 19, 5))
+ # Constaxonomy file
+ tax = import_mothur(mothur_constaxonomy_file=constaxonomy)
+ expect_is(tax, "taxonomyTable")
+})
+
+################################################################################
+# import_RDP tests
+test_that("the import_RDP_otu function can properly read gzipped-example", {
+ # Setup data
+ otufile <- system.file("extdata",
+ "rformat_dist_0.03.txt.gz",
+ package="phyloseq")
+ ex_otu <- import_RDP_otu(otufile)
+ # test expectations
+ expect_output(print(head(t(ex_otu))), "OTU Table:")
+ expect_is(ex_otu, "otu_table")
+ expect_equal(ntaxa(ex_otu), 5276)
+ expect_equal(nsamples(ex_otu), 14)
+ expect_is(sample_sums(ex_otu), "numeric")
+})
+
+
+################################################################################
+# import_qiime tests
+################################################################################
+otufile <- system.file("extdata", "GP_otu_table_rand_short.txt.gz", package="phyloseq")
+mapfile <- system.file("extdata", "master_map.txt", package="phyloseq")
+trefile <- system.file("extdata", "GP_tree_rand_short.newick.gz", package="phyloseq")
+rs_file <- system.file("extdata", "qiime500-refseq.fasta", package="phyloseq")
+
+t0 <- import_qiime(otufile, mapfile, trefile, rs_file, verbose=FALSE)
+test_that("Class of import result is phyloseq-class", {
+ expect_that(t0, is_a("phyloseq"))
+})
+
+test_that("Classes of components are as expected", {
+ expect_that(otu_table(t0), is_a("otu_table"))
+ expect_that(tax_table(t0), is_a("taxonomyTable"))
+ expect_that(sample_data(t0), is_a("sample_data"))
+ expect_that(phy_tree(t0), is_a("phylo"))
+ expect_that(refseq(t0), is_a("DNAStringSet"))
+})
+
+test_that("Features of the abundance data are consistent, match known values", {
+ expect_that(sum(taxa_sums(t0)), equals(1269671L))
+ expect_that(sum(taxa_sums(t0)==0), equals(5L))
+ expect_that(sum(taxa_sums(t0)>=100), equals(183L))
+ expect_that(sum(taxa_sums(t0)), equals(sum(sample_sums(t0))))
+ expect_that(sum(sample_sums(t0) > 10000L), equals(20L))
+ expect_that(nsamples(t0), equals(26L))
+ expect_that(ntaxa(t0), equals(500L))
+ expect_that(length(rank_names(t0)), equals(7L))
+})
+
+test_that("Features of the taxonomy table match expected values", {
+ expect_that(length(rank_names(t0)), equals(7L))
+ expect_equal(rank_names(t0),
+ c("Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species"))
+ tax53 = as(tax_table(t0), "matrix")[53, ]
+ expect_that(tax53, is_equivalent_to(c("Bacteria", "Proteobacteria", "Deltaproteobacteria",
+ "Desulfovibrionales", "Desulfomicrobiaceae",
+ "Desulfomicrobium", "Desulfomicrobiumorale")))
+})
+################################################################################
+# parse function tests - note, these are also used by import_biom
+
+test_that("Taxonomy vector parsing functions behave as expected", {
+
+ chvec1 = c("Bacteria", "Proteobacteria", "Gammaproteobacteria",
+ "Enterobacteriales", "Enterobacteriaceae", "Escherichia")
+
+ chvec2 = c("k__Bacteria", "p__Proteobacteria", "c__Gammaproteobacteria",
+ "o__Enterobacteriales", "f__Enterobacteriaceae", "g__Escherichia", "s__")
+
+ chvec3 = c("Root", "k__Bacteria", "p__Firmicutes", "c__Bacilli",
+ "o__Bacillales", "f__Staphylococcaceae")
+
+ # Example where only some entries have greengenes prefix.
+ chvec4 = c("Root", "k__Bacteria", "Firmicutes", "c__Bacilli",
+ "o__Bacillales", "Staphylococcaceae", "z__mistake")
+
+ # Even more terrible example, where leading or trailing space characters included
+ # (the exact weirdness of chvec4, compounded by leading and/or trailing space characters)
+ chvec5 = c(" Root \n ", " k__Bacteria", " Firmicutes", " c__Bacilli ",
+ "o__Bacillales ", "Staphylococcaceae ", "\t z__mistake \t\n")
+
+ # This should give a warning because there were no greengenes prefixes
+ expect_warning(t1 <- parse_taxonomy_greengenes(chvec1))
+ # And output from previous call, t1, should be identical to default
+ expect_that(parse_taxonomy_default(chvec1), is_equivalent_to(t1))
+
+ # All the greengenes entries get trimmed by parse_taxonomy_greengenes
+ expect_that(all(sapply(chvec2, nchar) > sapply(parse_taxonomy_greengenes(chvec2), nchar)), is_true())
+ # None of the greengenes entries are trimmed by parse_taxonomy_default
+ expect_that(any(sapply(chvec2, nchar) > sapply(parse_taxonomy_default(chvec2), nchar)), is_false())
+
+ # Check that the "Root" element is not removed by parse_taxonomy_greengenes and parse_taxonomy_default.
+ expect_that("Root" %in% chvec3, is_true())
+ expect_that("Root" %in% parse_taxonomy_default(chvec3), is_true())
+ expect_that(length(parse_taxonomy_default(chvec3)) == length(chvec3), is_true())
+
+ # Check that non-greengenes prefixes, and those w/o prefixes, are given dummy rank(s)
+ chvec4ranks = names(parse_taxonomy_greengenes(chvec4))
+ expect_that(grep("Rank", chvec4ranks, fixed=TRUE), is_equivalent_to(c(1, 3, 6, 7)))
+ # Check that everything given dummy rank in default parse.
+ chvec4ranks = names(parse_taxonomy_default(chvec4))
+ expect_that(grep("Rank", chvec4ranks, fixed=TRUE), is_equivalent_to(1:7))
+
+ # chvec4 and chvec5 result in identical vectors.
+ expect_that(parse_taxonomy_default(chvec4), is_equivalent_to(parse_taxonomy_default(chvec5)))
+ expect_that(parse_taxonomy_greengenes(chvec4), is_equivalent_to(parse_taxonomy_greengenes(chvec5)))
+
+ # The names of chvec5, greengenes parsed, should be...
+ correct5names = c("Rank1", "Kingdom", "Rank3", "Class", "Order", "Rank6", "Rank7")
+ expect_that(names(parse_taxonomy_greengenes(chvec5)), is_equivalent_to(correct5names))
+})
+
+################################################################################
+# import_biom tests
+
+rich_dense_biom <- system.file("extdata", "rich_dense_otu_table.biom", package="phyloseq")
+rich_sparse_biom <- system.file("extdata", "rich_sparse_otu_table.biom", package="phyloseq")
+min_dense_biom <- system.file("extdata", "min_dense_otu_table.biom", package="phyloseq")
+min_sparse_biom <- system.file("extdata", "min_sparse_otu_table.biom", package="phyloseq")
+# the tree and refseq file paths that are suitable for all biom format style examples
+treefilename = system.file("extdata", "biom-tree.phy", package="phyloseq")
+refseqfilename = system.file("extdata", "biom-refseq.fasta", package="phyloseq")
+
+test_that("Importing biom files yield phyloseq objects", {
+ library("biomformat")
+ rdbiom = read_biom(rich_dense_biom)
+ rsbiom = read_biom(rich_sparse_biom)
+ # Import from the already-parsed biom objects (dense and sparse), not from file paths.
+ rich_dense = import_biom(rdbiom)
+ rich_sparse = import_biom(rsbiom)
+
+ expect_that(rich_dense, is_a("phyloseq"))
+ expect_that(rich_sparse, is_a("phyloseq"))
+
+ expect_that(ntaxa(rich_dense), equals(5L))
+ expect_that(ntaxa(rich_sparse), equals(5L))
+
+ # # Component classes
+ # sample_data
+ expect_that(access(rich_dense, "sam_data"), is_a("sample_data"))
+ expect_that(access(rich_sparse, "sam_data"), is_a("sample_data"))
+
+ # taxonomyTable
+ expect_that(access(rich_dense, "tax_table"), is_a("taxonomyTable"))
+ expect_that(access(rich_sparse, "tax_table"), is_a("taxonomyTable"))
+
+ # otu_table
+ expect_that(access(rich_dense, "otu_table"), is_a("otu_table"))
+ expect_that(access(rich_sparse, "otu_table"), is_a("otu_table"))
+})
+
+test_that("The different types of biom files yield phyloseq objects",{
+ rich_dense = import_biom(rich_dense_biom, treefilename, refseqfilename, parseFunction=parse_taxonomy_greengenes)
+ rich_sparse = import_biom(rich_sparse_biom, treefilename, refseqfilename, parseFunction=parse_taxonomy_greengenes)
+ min_dense = import_biom(min_dense_biom, treefilename, refseqfilename, parseFunction=parse_taxonomy_greengenes)
+ min_sparse = import_biom(min_sparse_biom, treefilename, refseqfilename, parseFunction=parse_taxonomy_greengenes)
+
+ expect_that(rich_dense, is_a("phyloseq"))
+ expect_that(rich_sparse, is_a("phyloseq"))
+ expect_that(min_dense, is_a("phyloseq"))
+ expect_that(min_sparse, is_a("phyloseq"))
+
+ expect_that(ntaxa(rich_dense), equals(5L))
+ expect_that(ntaxa(rich_sparse), equals(5L))
+ expect_that(ntaxa(min_dense), equals(5L))
+ expect_that(ntaxa(min_sparse), equals(5L))
+
+ # # Component classes
+ # sample_data
+ expect_that(access(rich_dense, "sam_data"), is_a("sample_data"))
+ expect_that(access(rich_sparse, "sam_data"), is_a("sample_data"))
+ expect_that(access(min_dense, "sam_data"), is_a("NULL"))
+ expect_that(access(min_sparse, "sam_data"), is_a("NULL"))
+
+ # taxonomyTable
+ expect_that(access(rich_dense, "tax_table"), is_a("taxonomyTable"))
+ expect_that(access(rich_sparse, "tax_table"), is_a("taxonomyTable"))
+ expect_that(access(min_dense, "tax_table"), is_a("NULL"))
+ expect_that(access(min_sparse, "tax_table"), is_a("NULL"))
+
+ # phylo tree
+ expect_that(access(rich_dense, "phy_tree"), is_a("phylo"))
+ expect_that(access(rich_sparse, "phy_tree"), is_a("phylo"))
+ expect_that(access(min_dense, "phy_tree"), is_a("phylo"))
+ expect_that(access(min_sparse, "phy_tree"), is_a("phylo"))
+
+ # reference sequences
+ expect_that(inherits(access(rich_dense, "refseq"), "XStringSet"), is_true())
+ expect_that(inherits(access(rich_sparse, "refseq"), "XStringSet"), is_true())
+ expect_that(inherits(access(min_dense, "refseq"), "XStringSet"), is_true())
+ expect_that(inherits(access(min_sparse, "refseq"), "XStringSet"), is_true())
+ expect_that(access(rich_dense, "refseq"), is_a("DNAStringSet"))
+ expect_that(access(rich_sparse, "refseq"), is_a("DNAStringSet"))
+ expect_that(access(min_dense, "refseq"), is_a("DNAStringSet"))
+ expect_that(access(min_sparse, "refseq"), is_a("DNAStringSet"))
+
+ # otu_table
+ expect_that(access(rich_dense, "otu_table"), is_a("otu_table"))
+ expect_that(access(rich_sparse, "otu_table"), is_a("otu_table"))
+ expect_that(access(min_dense, "otu_table"), is_a("otu_table"))
+ expect_that(access(min_sparse, "otu_table"), is_a("otu_table"))
+
+ # Compare values in the otu_table. For some reason the otu_tables are not identical
+ # one position is plus-two, another is minus-two
+ combrich <- c(access(rich_dense, "otu_table"), access(rich_sparse, "otu_table"))
+ expect_that(sum(diff(combrich, length(access(rich_dense, "otu_table")))), is_equivalent_to(0))
+ expect_that(max(diff(combrich, length(access(rich_dense, "otu_table")))), is_equivalent_to(2))
+ expect_that(min(diff(combrich, length(access(rich_dense, "otu_table")))), is_equivalent_to(-2))
+ combmin <- c(access(min_dense, "otu_table"), access(min_sparse, "otu_table"))
+ expect_that(sum(diff(combmin, length(access(min_dense, "otu_table")))), is_equivalent_to(0))
+ expect_that(max(diff(combmin, length(access(min_dense, "otu_table")))), is_equivalent_to(2))
+ expect_that(min(diff(combmin, length(access(min_dense, "otu_table")))), is_equivalent_to(-2))
+
+ expect_that(access(min_dense, "otu_table"), is_equivalent_to(access(rich_dense, "otu_table")))
+ expect_that(access(min_sparse, "otu_table"), is_equivalent_to(access(rich_sparse, "otu_table")))
+
+ # Compare values in the sample_data
+ expect_that(access(rich_dense, "sam_data"), is_equivalent_to(access(rich_sparse, "sam_data")))
+
+ # Compare values in the taxonomyTable
+ expect_that(access(rich_dense, "tax_table"), is_equivalent_to(access(rich_sparse, "tax_table")))
+
+})
+
+test_that("the import_biom and import(\"biom\", ) syntax give same result", {
+ x1 <- import_biom(rich_dense_biom, parseFunction=parse_taxonomy_greengenes)
+ x2 <- import("biom", BIOMfilename=rich_dense_biom, parseFunction=parse_taxonomy_greengenes)
+ expect_that(x1, is_equivalent_to(x2))
+})
+################################################################################
+# read_tree tests
+test_that("The read_tree function works as expected:", {
+ GPNewick <- read_tree(system.file("extdata", "GP_tree_rand_short.newick.gz", package="phyloseq"))
+ expect_that(GPNewick, is_a("phylo"))
+ expect_that(ntaxa(GPNewick), equals(length(GPNewick$tip.label)))
+ expect_that(ntaxa(GPNewick), equals(500))
+ expect_that(GPNewick$Nnode, equals(499))
+ expect_that(taxa_names(GPNewick), is_equivalent_to(GPNewick$tip.label))
+ # Now read a nexus tree...
+ # Some error-handling expectations
+ expect_that(read_tree("alskflsakjsfskfhas.akshfaksj"), gives_warning()) # file not exist
+ not_tree <- system.file("extdata", "esophagus.good.groups.gz", package="phyloseq")
+ expect_that(read_tree(not_tree), is_a("NULL")) # file not a tree, gives NULL
+ expect_that(read_tree(not_tree, TRUE), throws_error()) # file not a tree, check turned on/TRUE
+})
+# read_tree_greengenes
+test_that("The specialized read_tree_greengenes function works:", {
+ # The included, gzipped version of the 13_5 73% similarity greengenes tree.
+ # It causes ape::read.tree to fail with an error, but read_tree_greengenes should be fine.
+ treefile = system.file("extdata", "gg13-5-73.tree.gz", package="phyloseq")
+ x = read_tree_greengenes(treefile)
+ expect_is(x, "phylo")
+ # Happen to know that all OTU names should be numbers.
+ expect_match(x$tip.label, "[[:digit:]]+")
+ # All tip/OTU names should be unique
+ expect_false(any(duplicated(taxa_names(x))))
+ # The more general read_tree function should fail to read and return NULL
+ expect_is(read_tree(treefile), "NULL")
+})
+################################################################################
+# microbio_me_qiime tests
+# This tests different features and expected behavior for
+# the functioning of an interface function to the
+# microbio.me/qiime data repository.
+#
+zipfile = "study_816_split_library_seqs_and_mapping.zip"
+zipfile = system.file("extdata", zipfile, package="phyloseq")
+tarfile = "study_816_split_library_seqs_and_mapping.tar.gz"
+tarfile = system.file("extdata", tarfile, package="phyloseq")
+tarps = suppressWarnings(microbio_me_qiime(tarfile))
+zipps = suppressWarnings(microbio_me_qiime(zipfile))
+# This function is intended to interface with an external server,
+# as described in the documentation.
+# However, I don't want successful testing of this package to
+# rely on the presence or form of particular files on an
+# external server, so these tests will be done exclusively on
+# compressed file(s) representing what is exposed by the data server
+# It is up to the user to provide a valid URL in practice,
+# and the function attempts to provide informative status
+# and error messages if things go awry.
+test_that("The microbio_me_qiime imports as expected: .tar.gz", {
+ expect_is(tarps, "phyloseq")
+ expect_is(sample_data(tarps, errorIfNULL=FALSE), "sample_data")
+ expect_is(otu_table(tarps, errorIfNULL=FALSE), "otu_table")
+ expect_identical(nrow(otu_table(tarps)), 50L)
+ expect_identical(nrow(sample_data(tarps)), 15L)
+})
+test_that("The microbio_me_qiime imports as expected: .zip", {
+ expect_is(zipps, "phyloseq")
+ expect_is(sample_data(zipps, errorIfNULL=FALSE), "sample_data")
+ expect_is(otu_table(zipps, errorIfNULL=FALSE), "otu_table")
+ expect_identical(nrow(otu_table(zipps)), 50L)
+ expect_identical(nrow(sample_data(zipps)), 15L)
+})
+test_that("Results of .tar.gz and .zip should be identical", {
+ expect_identical(tarps, zipps)
+ expect_identical(sample_data(tarps), sample_data(zipps))
+ expect_identical(otu_table(tarps), otu_table(zipps))
+})
+################################################################################
+# import_usearch_uc
+################################################################################
+usearchfile = system.file("extdata", "usearch.uc", package="phyloseq")
+OTU1 = import_usearch_uc(usearchfile)
+test_that("import_usearch_uc: Properly omit entries from failed search", {
+ ucLines = readLines(usearchfile)
+ expect_identical( sum(OTU1), (length(ucLines) - length(grep("*", ucLines, fixed=TRUE))) )
+ expect_identical( nrow(OTU1), 37L)
+ expect_identical( nrow(OTU1), nsamples(OTU1))
+ expect_identical( ncol(OTU1), ntaxa(OTU1))
+ expect_identical( ncol(OTU1), 33L)
+ expect_equivalent(colSums(OTU1)[1:6], c(6, 1, 2, 1, 1, 1))
+})
+################################################################################
\ No newline at end of file
diff --git a/tests/testthat/test-distance.R b/tests/testthat/test-distance.R
new file mode 100644
index 0000000..3a80ec0
--- /dev/null
+++ b/tests/testthat/test-distance.R
@@ -0,0 +1,118 @@
+################################################################################
+# Use testthat to test that distance methods return correct results.
+################################################################################
+library("phyloseq"); packageVersion("phyloseq")
+library("testthat"); packageVersion("testthat")
+################################################################################
+# UniFrac testing section. Relies on pre-computed results from pycogent
+# The relevant python code is saved in `extdata/gp500-pycogent.py`
+################################################################################
+# Define the test-example phyloseq dataset: a random subsample from Global Patterns
+treeFile <- system.file("extdata", "GP_tree_rand_short.newick.gz", package = "phyloseq")
+GP500File <- system.file("extdata", "GP_otu_table_rand_short.txt.gz", package = "phyloseq")
+GP500 <- import_qiime(otufilename = GP500File, treefilename = treeFile)
+# # Example if you want to re-create the test files for calculating with pyCogent
+#export_env_file(GP500, file = "~/Downloads/gp500test.env.txt", writeTree = FALSE)
+#ape::write.tree(phy_tree(GP500), file = "~/Downloads/gp500test.tree")
+# Now import the pre-computed reference results with read.csv()
+gp500_uuf <- read.csv(file = system.file("extdata", "gp500-uuf.csv", package = "phyloseq"), header = FALSE, fill = TRUE)
+gp500_wuf <- read.csv(file = system.file("extdata", "gp500-wuf.csv", package = "phyloseq"), header = FALSE, fill = TRUE)
+gp500_wufu <- read.csv(file = system.file("extdata", "gp500-wufu.csv", package = "phyloseq"), header = FALSE, fill = TRUE)
+# Label rows and columns of every reference table with the sample names
+colnames(gp500_uuf) <- rownames(gp500_uuf) <- colnames(gp500_wuf) <- rownames(gp500_wuf) <- colnames(gp500_wufu) <- rownames(gp500_wufu) <- sample_names(GP500)
+# Coerce the reference tables to class `"dist"` for comparison
+gp500_uuf <- as.dist(gp500_uuf)
+gp500_wuf <- as.dist(gp500_wuf)
+gp500_wufu <- as.dist(gp500_wufu)
+# Numerical tolerance used by the comparisons below
+tol <- 1e-8
+test_that("UniFrac produces correct values on an example subset from Global Patterns. 'Correct' values are results from pyCogent", {
+ # Direct calls to the `UniFrac` function
+ expect_equal(object = gp500_wufu, expected = UniFrac(GP500, weighted = TRUE, normalized = FALSE), tolerance = tol, check.attributes = FALSE,
+ label = "`UniFrac`: Weighted but Un-normalized UniFrac results did not match reference answer.")
+ expect_equal(object = gp500_wuf, expected = UniFrac(GP500, weighted = TRUE), tolerance = tol, check.attributes = FALSE,
+ label = "`UniFrac`: Weighted, normalized UniFrac results did not match reference answer.")
+ expect_equal(object = gp500_uuf, expected = UniFrac(GP500, weighted = FALSE), tolerance = tol, check.attributes = FALSE,
+ label = "`UniFrac`: Unweighted UniFrac results did not match reference answer.")
+ # Same comparisons, routed through the `distance` wrapper
+ expect_equal(object = gp500_wufu, expected = distance(GP500, "unifrac", weighted = TRUE, normalized = FALSE), tolerance = tol, check.attributes = FALSE,
+ label = "`distance`: Weighted but Un-normalized UniFrac results did not match reference answer.")
+ expect_equal(object = gp500_wuf, expected = distance(GP500, "unifrac", weighted = TRUE), tolerance = tol, check.attributes = FALSE,
+ label = "`distance`: Weighted, normalized UniFrac results did not match reference answer.")
+ expect_equal(object = gp500_uuf, expected = distance(GP500, "unifrac", weighted = FALSE), tolerance = tol, check.attributes = FALSE,
+ label = "`distance`: Unweighted UniFrac results did not match reference answer.")
+ # Sanity check: the reference matrices must differ from one another
+ expect_false(isTRUE(all.equal(gp500_uuf, gp500_wuf, tolerance = 0.01, check.attributes = FALSE)),
+ label = "The reference matrices for UniFrac testing should be different, but were not. uuf/wuf")
+ expect_false(isTRUE(all.equal(gp500_uuf, gp500_wufu, tolerance = 0.01, check.attributes = FALSE)),
+ label = "The reference matrices for UniFrac testing should be different, but were not. uuf/wufu")
+ expect_identical(object = distance(GP500, "wunifrac"), expected = distance(GP500, "unifrac", weighted = TRUE),
+ label = "wunifrac output is not identical to unifrac with weighted=T flag")
+})
+test_that("Check that regular-expression matching for unifrac method flag is working", {
+ expect_identical(object = distance(GP500, "w-UniFrac"), expected = distance(GP500, "unifrac", weighted = TRUE))
+ expect_identical(object = distance(GP500, "weighted-UniFrac"), expected = distance(GP500, "unifrac", weighted = TRUE))
+ expect_identical(object = distance(GP500, "unweighted-UniFrac"), expected = distance(GP500, "unifrac"))
+ expect_identical(object = distance(GP500, "u-UniFrac"), expected = distance(GP500, "unifrac"))
+})
+################################################################################
+# Test other distances against their expected dispatch explicit calculation
+################################################################################
+test_that("Test accurate dispatch for other distances", {
+ otumatgp500 <- t(as(otu_table(GP500), "matrix"))
+ # Every vegdist method, called on the phyloseq object
+ expect_equal(
+ lapply(distanceMethodList$vegdist, function(m) vegan::vegdist(otumatgp500, method = m)),
+ lapply(distanceMethodList$vegdist, function(m) distance(GP500, method = m)),
+ tolerance = tol, check.attributes = FALSE)
+ # Every vegdist method, called on the OTU table
+ expect_equal(
+ lapply(distanceMethodList$vegdist, function(m) vegan::vegdist(otumatgp500, method = m)),
+ lapply(distanceMethodList$vegdist, function(m) distance(otu_table(GP500), method = m)),
+ tolerance = tol, check.attributes = FALSE)
+ # Every betadiver method, called on the phyloseq object
+ expect_equal(
+ lapply(distanceMethodList$betadiver, function(m) vegan::betadiver(otumatgp500, method = m)),
+ lapply(distanceMethodList$betadiver, function(m) distance(GP500, method = m)),
+ tolerance = tol, check.attributes = FALSE)
+ # Every betadiver method, called on the OTU table
+ expect_equal(
+ lapply(distanceMethodList$betadiver, function(m) vegan::betadiver(otumatgp500, method = m)),
+ lapply(distanceMethodList$betadiver, function(m) distance(otu_table(GP500), method = m)),
+ tolerance = tol, check.attributes = FALSE)
+ # Every stats::dist method, called on the phyloseq object
+ expect_equal(
+ lapply(distanceMethodList$dist, function(m) stats::dist(otumatgp500, method = m)),
+ lapply(distanceMethodList$dist, function(m) distance(GP500, method = m)),
+ tolerance = tol, check.attributes = FALSE)
+ # Every stats::dist method, called on the OTU table
+ expect_equal(
+ lapply(distanceMethodList$dist, function(m) stats::dist(otumatgp500, method = m)),
+ lapply(distanceMethodList$dist, function(m) distance(otu_table(GP500), method = m)),
+ tolerance = tol, check.attributes = FALSE)
+
+ # DPCoA
+ #"dpcoa"
+ # phyloseq object
+ # TOO SLOW to test routinely. Commenting out.
+ # expect_equal(
+ # as.dist(DPCoA(GP500)$RaoDis, diag=FALSE),
+ # distance(GP500, "dpcoa"),
+ # check.attributes = FALSE, tolerance = tol)
+ # A bare OTU table carries no tree, so DPCoA must fail
+ expect_error(DPCoA(otu_table(GP500)))
+
+ # JSD
+ #"jsd"
+ # phyloseq object
+ expect_equal(
+ phyloseq:::JSD(GP500),
+ distance(GP500, method = "jsd"),
+ tolerance = tol, check.attributes = FALSE)
+ # OTU table
+ expect_equal(
+ phyloseq:::JSD(otu_table(GP500)),
+ distance(otu_table(GP500), method = "jsd"),
+ tolerance = tol, check.attributes = FALSE)
+})
+################################################################################
diff --git a/tests/testthat/test-merge.R b/tests/testthat/test-merge.R
new file mode 100644
index 0000000..5f2cf34
--- /dev/null
+++ b/tests/testthat/test-merge.R
@@ -0,0 +1,305 @@
+# testthat tests don't do anything when successful.
+library("phyloseq"); packageVersion("phyloseq")
+library("testthat"); packageVersion("testthat")
+
+
+# # # Tests!
+
+################################################################################
+# merge_samples
+data("GlobalPatterns")
+# GP <- prune_taxa(taxa_sums(GlobalPatterns)>0, GlobalPatterns)
+GP <- GlobalPatterns
+mGP <- merge_samples(GP, "SampleType") # samples merged by their SampleType
+
+test_that("Classes of merged phyloseq objects are as expected", {
+ expect_is(merge_samples(otu_table(GP), get_variable(GP, "SampleType")), "otu_table") # component-wise merge
+ expect_is(merge_samples(sample_data(GP), "SampleType"), "sample_data") # component-wise merge
+ expect_is(mGP, "phyloseq") # whole-object merge
+})
+
+test_that("Same sam_data result for separate and combined merge in merge_samples", {
+ expect_identical( # merging the component alone must equal the component of the merged object
+ merge_samples(sample_data(GP), "SampleType"),
+ sample_data(mGP)
+ )
+})
+
+test_that("Same otu_table result for separate and combined merge in merge_samples", {
+ expect_identical( # merging the component alone must equal the component of the merged object
+ merge_samples(otu_table(GP), get_variable(GP, "SampleType")),
+ otu_table(mGP)
+ )
+})
+
+test_that("Sample Names of merged object now same set as merging factor levels", {
+ sampleTypes <- levels(data.frame(sample_data(GP))$SampleType)
+ expect_identical(setdiff(sampleTypes, sample_names(mGP)), character()) # no factor level is missing from the merged names
+})
+
+test_that("Counts from merged-samples are summed...", {
+ OTUnames10 <- names(sort(taxa_sums(GP), decreasing = TRUE)[1:10]) # the ten most abundant OTUs
+ GP10 <- prune_taxa(OTUnames10, GP)
+ mGP10 <- prune_taxa(OTUnames10, mGP)
+ # Loop to check the correct summation has occurred for all OTUs.
+ for( i in OTUnames10 ){
+ isum <- as(tapply(get_sample(GP10, i), get_variable(GP10, "SampleType"), sum), "numeric")
+ expect_equivalent(isum, get_sample(mGP10, i))
+ }
+})
+
+################################################################################
+# merge_phyloseq
+test_that("merge_phyloseq: Break apart GP based on human-association,
+ then merge back together.", {
+ data(GlobalPatterns)
+ GP <- prune_taxa(taxa_names(GlobalPatterns)[1:100], GlobalPatterns)
+ sample_data(GP)$human <- factor(get_variable(GP, "SampleType") %in% c("Feces", "Mock", "Skin", "Tongue"))
+ h1 <- subset_samples(GP, human=="TRUE")
+ h0 <- subset_samples(GP, human=="FALSE")
+ GP1 <- merge_phyloseq(h0, h1)
+
+ # The species order is fixed to the tree,
+ # so should be the same between the original and merged
+ expect_identical(taxa_names(GP), taxa_names(GP1))
+ expect_identical(phy_tree(h1), phy_tree(h0))
+
+ # However, the sample order has been shuffled by the split/merge.
+ # Fix the sample order by re-ordering the otu_table, and reassigning
+ sa.order <- sample_names(GP)
+ sa.order <- sa.order[sa.order %in% sample_names(GP1)]
+ otu_table(GP1) <- otu_table(GP1)[, sa.order]
+ expect_equal(sample_names(GP), sample_names(GP1))
+ expect_equal(sample_names(sample_data(GP)), sample_names(sample_data(GP1)))
+ # Sample data entries are the same, irrespective of factor levels
+ GPfactors = which(sapply(sample_data(GP1), inherits, "factor"))
+ for(j in names(GPfactors)){ # look variables up by name so GP and GP1 are compared column-for-column
+ expect_equal(as.character(get_variable(GP, j)),
+ as.character(get_variable(GP1, j)))
+ }
+ # Reconcile factor level order for remaining tests
+ GP1factors = which(sapply(sample_data(GP1), inherits, "factor"))
+ for(j in names(GP1factors)){
+ varj = as.character(get_variable(GP1, j))
+ sample_data(GP1)[, j] <- factor(varj, levels = sort(unique(varj)))
+ }
+ GPfactors = which(sapply(sample_data(GP), inherits, "factor"))
+ for(j in names(GPfactors)){
+ varj = as.character(get_variable(GP, j))
+ sample_data(GP)[, j] <- factor(varj, levels = sort(unique(varj)))
+ }
+ # Check a specific variable
+ expect_equal(sample_data(GP1)$SampleType,
+ sample_data(GP)$SampleType)
+ # Should be fixed now. Full object and components now identical
+ expect_equal(GP1, GP)
+ expect_identical(GP1, GP)
+ expect_identical(otu_table(GP1), otu_table(GP))
+ expect_identical(tax_table(GP1), tax_table(GP))
+ expect_identical(phy_tree(GP1), phy_tree(GP))
+
+ ## Check factor levels
+ # The set
+ expect_identical(sort(levels(sample_data(GP1)$SampleType)),
+ sort(levels(sample_data(GP)$SampleType)))
+ # The order
+ expect_identical(levels(sample_data(GP1)$SampleType),
+ levels(sample_data(GP)$SampleType))
+ # Overall
+ expect_identical(sample_data(GP1), sample_data(GP))
+ expect_identical(droplevels(sample_data(GP1)), droplevels(sample_data(GP)))
+
+ # Check variable names are all there (set)
+ expect_equal(
+ object = sort(intersect(colnames(sample_data(GP1)), colnames(sample_data(GP)))),
+ expected = sort(colnames(sample_data(GP1))))
+ # Check column classes
+ expect_equal(sapply(sample_data(GP1), class), sapply(sample_data(GP), class))
+ # Check column names
+ expect_equal(colnames(sample_data(GP1)), colnames(sample_data(GP)))
+ # Check sample name order
+ expect_equal(sample_names(sample_data(GP1)), sample_names(sample_data(GP)))
+ expect_equal(sample_names(GP1), sample_names(GP))
+})
+
+################################################################################
+# tax_glom
+# Load data
+data("GlobalPatterns")
+GP.chl <- subset_taxa(GlobalPatterns, Phylum == "Chlamydiae") # fixture reused by the tests below
+test_that("the tax_table slot is identical whether tax_glom()ed by itself or as component", {
+ expect_is(tax_glom(tax_table(GP.chl), "Family"), "taxonomyTable")
+ expect_is(n1 <- tax_glom(GP.chl, "Family"), "phyloseq")
+ expect_equal(ntaxa(n1), 4L)
+ expect_equivalent(
+ tax_glom(tax_table(GP.chl), taxrank="Family"),
+ tax_table(tax_glom(GP.chl, taxrank="Family"))
+ )
+ n1 <- as(tax_glom(tax_table(GP.chl), taxrank="Family", NArm=FALSE), "matrix")[, "Family"] # n1 is reused: now the Family column
+ n2 <- tax_glom(GP.chl, taxrank="Family", NArm=FALSE)
+ expect_true(setequal(n1, as(tax_table(n2), "matrix")[, "Family"]))
+ expect_equal(ntaxa(n2), 5L)
+})
+test_that("tax_glom() handles clearly agglomeration to one taxa", {
+ expect_warning(n1 <- tax_glom(GP.chl, "Phylum")) # n1 is assigned in the test environment and used below
+ expect_is(n1, "phyloseq")
+ expect_equal(ntaxa(n1), 1L)
+ expect_null(access(n1, "phy_tree")) # the tree component is expected to be NULL here
+})
+test_that("tax_glom() can handle even the highest rank glom", {
+ expect_warning(tax_glom(GP.chl, taxrank = "Kingdom"))
+ gpk <- tax_glom(GlobalPatterns, taxrank = "Kingdom")
+ expect_is(object = gpk, class = "phyloseq")
+ expect_equivalent(object = ntaxa(gpk), expected = 2)
+ expect_equivalent(object = taxa_sums(gpk), expected = c(195598, 28021080))
+})
+################################################################################
+# prune_taxa
+# Use the GP.chl dataset from previous testing block
+test_that("prune_taxa() handles clearly pruning to one taxa", {
+ # throws a warning, and the tree component becomes NULL
+ expect_warning(n1 <- prune_taxa(taxa_names(GP.chl)[1:1], GP.chl))
+ expect_equal(ntaxa(n1), 1L)
+ expect_is(n1, "phyloseq")
+ expect_null(access(n1, "phy_tree"))
+ expect_is(access(n1, "otu_table"), "otu_table")
+})
+test_that("prune_taxa() properly handles standard-cases", {
+ # Pruning to several taxa keeps every component, including a valid tree
+ expect_is(n1 <- prune_taxa(taxa_names(GP.chl)[1:5], GP.chl), "phyloseq")
+ expect_equal(ntaxa(n1), 5L)
+ expect_is(access(n1, "phy_tree"), "phylo")
+ expect_is(access(n1, "otu_table"), "otu_table")
+ expect_is(access(n1, "sam_data"), "sample_data")
+ expect_is(access(n1, "tax_table"), "taxonomyTable")
+ # Use logical vector, and get same answer
+ L2 <- logical(ntaxa(GP.chl)) # all FALSE to start
+ L2[1:5] <- TRUE
+ expect_is(n2 <- prune_taxa(L2, GP.chl), "phyloseq")
+ expect_identical(n2, n1)
+})
+################################################################################
+# merge_taxa
+# Use the GP.chl dataset from previous testing block
+test_that("merge_taxa() properly handles standard-cases", {
+ expect_is(n1 <- merge_taxa(GP.chl, c("24341", "579085")), "phyloseq")
+ expect_equal(ntaxa(n1), 20L)
+ # The first name is kept, others removed
+ expect_false("579085" %in% taxa_names(n1))
+ expect_true("24341" %in% taxa_names(n1))
+ # Try a 3-element merge, check that the largest-count remains.
+ OTUIDs <- c("579085", "24341", "547579")
+ biggestOTU <- names(which.max(taxa_sums(GP.chl)[OTUIDs]))
+ # Perform the merge of `OTUIDs`, and check the resulting class while at it.
+ expect_is(n2 <- merge_taxa(GP.chl, OTUIDs), "phyloseq")
+ # Check that there are now the correct, fewer number of OTUs
+ expect_equal(ntaxa(n2), (ntaxa(GP.chl)-length(OTUIDs)+1))
+ # The biggest OTU is kept, others merged
+ expect_true(biggestOTU %in% taxa_names(n2))
+ expect_true(!any(setdiff(OTUIDs, biggestOTU) %in% taxa_names(n2)))
+ # Merge again, but only use the tax_table. Without counts, the retained OTU defaults to the first in the vector
+ expect_is(n2b <- merge_taxa(tax_table(GP.chl), OTUIDs), "taxonomyTable")
+ # Check that there are now the correct, fewer number of OTUs
+ expect_equal(ntaxa(n2b), (ntaxa(GP.chl)-length(OTUIDs)+1))
+ # The first OTU in the vector is kept, others merged
+ expect_true(OTUIDs[1] %in% taxa_names(n2b))
+ expect_true(!any(setdiff(OTUIDs, OTUIDs[1]) %in% taxa_names(n2b)))
+ # Merge again, but specify the retained OTU name as the 3rd one, rather than the default
+ expect_is(n3 <- merge_taxa(GP.chl, eqtaxa=OTUIDs, archetype=OTUIDs[3]), "phyloseq")
+ # "547579" is kept, others removed
+ expect_true(OTUIDs[3] %in% taxa_names(n3))
+ expect_true(!any(setdiff(OTUIDs, OTUIDs[3]) %in% taxa_names(n3)))
+ # Check that the remaining OTU has the sum of the values merged
+ expect_identical(get_sample(n3, OTUIDs[3]),
+ colSums(as(otu_table(GP.chl), "matrix")[OTUIDs, ]))
+})
+test_that("merge_taxa() replaces disagreements in taxonomy with NA", {
+ # Try a more difficult merge from a different subset
+ GP20 <- prune_taxa(taxa_names(GlobalPatterns)[1:20], GlobalPatterns)
+ # Arbitrary merge into taxa "951", NA in ranks after Phylum
+ OTUIDs <- c("951", "586076", "141782", "30678", "30405")
+ biggestOTU <- names(which.max(taxa_sums(GP20)[OTUIDs]))
+ n5 <- merge_taxa(GP20, OTUIDs)
+ # The biggest OTU is kept, others merged
+ expect_true(biggestOTU %in% taxa_names(n5))
+ expect_true(!any(setdiff(OTUIDs, biggestOTU) %in% taxa_names(n5)))
+ # The taxonomy should be NA_character_ after Phylum (OTUIDs chosen carefully in this case)
+ n5_merged_taxonomy <- as(tax_table(n5), "matrix")[biggestOTU, ]
+ expect_true(!any(is.na(n5_merged_taxonomy[1:2])))
+ expect_true(all(is.na(n5_merged_taxonomy[3:7])))
+ # Test how well it works at a different level (say first or last ranks)
+ OTUIDs <- c("1126", "31759")
+ biggestOTU <- names(which.max(taxa_sums(GP20)[OTUIDs]))
+ n6 <- merge_taxa(GP20, OTUIDs)
+ # The biggest OTU is kept, others merged
+ expect_true(biggestOTU %in% taxa_names(n6))
+ expect_true(!any(setdiff(OTUIDs, biggestOTU) %in% taxa_names(n6)))
+ # Test that the taxonomy is NA after Order
+ n6_merged_taxonomy <- as(tax_table(n6), "matrix")[biggestOTU, ]
+ expect_true( !any(is.na(n6_merged_taxonomy[1:4])) )
+ expect_true( all(is.na(n6_merged_taxonomy[5:7])) )
+ # Test that it works for differences at the first rank
+ GP20f <- GP20
+ tax_table(GP20f)[1, 1] <- "Bacteria"
+ OTUIDs <- taxa_names(GP20f)[1:2]
+ biggestOTU <- names(which.max(taxa_sums(GP20f)[OTUIDs]))
+ expect_is(n7 <- merge_taxa(GP20f, OTUIDs), "phyloseq")
+ # Should be all NA taxonomy
+ expect_true( all(is.na(as(tax_table(n7), "matrix")[biggestOTU, ])) )
+ # Test that it works for differences at the last rank
+ # First, make the first taxa the same as "951"
+ tax_table(GP20f)[1, ] <- tax_table(GP20f)["951", ]
+ # Now change the last rank of this entry to something else
+ tax_table(GP20f)[1, length(rank_names(GP20f))] <- "species_phyloseq_test"
+ OTUIDs <- c("951", biggestOTU)
+ biggestOTU <- names(which.max(taxa_sums(GP20f)[OTUIDs]))
+ expect_is(n8 <- merge_taxa(GP20f, OTUIDs), "phyloseq")
+ t951 <- as(tax_table(n8), "matrix")[biggestOTU, ]
+ expect_equal( sum(is.na(t951)), 1L )
+ expect_true( is.na(t951[length(rank_names(n8))]) )
+ expect_identical( t951[-7], as(tax_table(GP20f), "matrix")["951", ][-7] )
+ # Test that it works if the taxonomies completely agree
+ GP20f <- GP20
+ # Make the first taxa the same as "951"
+ tax_table(GP20f)[1, ] <- tax_table(GP20f)["951", ]
+ merge_these <- c("549322", "951")
+ n9 <- merge_taxa(GP20f, merge_these)
+ n9t1 <- as(tax_table(n9), "matrix")["549322", ]
+ # None should be NA
+ expect_false(any(is.na(n9t1)))
+ expect_equal(length(n9t1), 7L)
+ # Merge worked, "951" is gone.
+ expect_false("951" %in% taxa_names(n9))
+})
+test_that("merge_taxa() properly handles different types and orders of taxa specified by the eqtaxa and archetype arguments, and also handles refseq data", {
+ # Test merge_taxa on data with a reference sequence file.
+ otufile <- system.file("extdata", "GP_otu_table_rand_short.txt.gz", package="phyloseq")
+ mapfile <- system.file("extdata", "master_map.txt", package="phyloseq")
+ trefile <- system.file("extdata", "GP_tree_rand_short.newick.gz", package="phyloseq")
+ rs_file <- system.file("extdata", "qiime500-refseq.fasta", package="phyloseq")
+ rs0 <- import_qiime(otufile, mapfile, trefile, rs_file)
+ rs1 <- merge_taxa(rs0, c("71074", "10517", "8096"))
+ rs2 <- merge_taxa(rs0, c("71074", "8096", "10517"), "71074")
+ rs3 <- merge_taxa(rs0, c("71074", "10517", "8096"), 3)
+ rs4 <- merge_taxa(rs0, c("8096", "71074", "10517"))
+ # rs1 and rs2 should be identical
+ # rs3 and rs4 should be identical
+ expect_equivalent(rs1, rs2)
+ expect_true(!identical(rs1, rs3))
+ expect_equivalent(rs3, rs4)
+ # double-check that components are all there
+ expect_equal(length(getslots.phyloseq(rs1)), 5L)
+ expect_equal(length(getslots.phyloseq(rs2)), 5L)
+ expect_equal(length(getslots.phyloseq(rs3)), 5L)
+ expect_equal(length(getslots.phyloseq(rs4)), 5L)
+ # The number of taxa should be the same as the original less two
+ expect_equal(ntaxa(rs1), ntaxa(rs0)-2L)
+ expect_equal(ntaxa(rs2), ntaxa(rs0)-2L)
+ expect_equal(ntaxa(rs3), ntaxa(rs0)-2L)
+ expect_equal(ntaxa(rs4), ntaxa(rs0)-2L)
+ # merge_taxa() errors when a bad archetype is provided
+ # Throws error because keepIndex is NULL
+ expect_error(merge_taxa(rs0, c("71074", "10517", "8096"), "wtf"))
+ # Throws error because keepIndex is not part of eqtaxa (logic error, invalid merge)
+ expect_error(merge_taxa(rs0, c("71074", "10517", "8096"), "13662"))
+})
diff --git a/tests/testthat/test-phyloseq.R b/tests/testthat/test-phyloseq.R
new file mode 100644
index 0000000..96c894f
--- /dev/null
+++ b/tests/testthat/test-phyloseq.R
@@ -0,0 +1,140 @@
+################################################################################
+# Use testthat to test phyloseq constructor and other class internals.
+################################################################################
+library("phyloseq"); library("testthat")
+# # # # TESTS!
+set.seed(8888)
+
+################################################################################
+test_that("phyloseq: Building a phyloseq-object when tree contains extra quotes, still works.", {
+ data("esophagus")
+ tree <- phy_tree(esophagus)
+ # Add extra quotes surrounding each OTU name in the tree
+ tree$tip.label <- paste0("\"", taxa_names(tree), "\"")
+ # Try to add the tree back to esophagus, replacing the original
+ # (Should work with message.)
+ esophagus1 <- esophagus # untouched copy to compare against
+ phy_tree(esophagus) <- tree
+ expect_identical(esophagus1, esophagus)
+ # Now try to "rebuild" using the quote-containing tree
+ esophagus2 <- phyloseq(tree, otu_table(esophagus))
+ expect_identical(esophagus2, esophagus)
+
+ # Try with a dataset with complete-set of components, Global Patterns
+ data("GlobalPatterns")
+ # Use a subset, because checking identicality of two large, complicated objects takes time.
+ minsum <- sort(taxa_sums(GlobalPatterns), decreasing = TRUE)[20]
+ GP <- prune_taxa(taxa_sums(GlobalPatterns) >= minsum, GlobalPatterns)
+ tree <- phy_tree(GP)
+ # Add extra quotes surrounding each OTU name in the tree
+ tree$tip.label <- paste0("\"", taxa_names(tree), "\"")
+ # Try to add the tree back to GP, replacing the original
+ # (Should work with message.)
+ GP1 <- GP # untouched copy to compare against
+ phy_tree(GP) <- tree
+ expect_identical(GP1, GP)
+ # Now try to "rebuild" using the quote-containing tree
+ GP2 <- phyloseq(tree, otu_table(GP), tax_table(GP), sample_data(GP))
+ expect_identical(GP2, GP)
+})
+################################################################################
+# More constructor-related tests needed here...
+
+################################################################################
+# - Test that re-assigning taxa_names and sample_names works.
+# - Use this to test that intersect_samples and intersect_taxa works.
+################################################################################
+data("GlobalPatterns")
+e1 <- prune_taxa(taxa_names(GlobalPatterns)[1:25], GlobalPatterns) # small fixture shared by the tests below
+
+test_that("taxa_names(x)<- and sample_names(x)<- behaves as expected", {
+ # taxa_names<-
+ new_taxa_names <- paste0("OTU-", taxa_names(e1))
+ taxa_names(e1) <- new_taxa_names
+ expect_identical(taxa_names(e1), new_taxa_names)
+ expect_identical(taxa_names(phy_tree(e1)), new_taxa_names)
+ expect_identical(taxa_names(otu_table(e1)), new_taxa_names)
+ expect_identical(taxa_names(tax_table(e1)), new_taxa_names)
+# expect_that(identical(taxa_names(refseq(e1)), new_taxa_names), is_true())
+
+ # sample_names<-
+ new_sample_names <- paste0("Sa-", sample_names(e1))
+ sample_names(e1) <- new_sample_names
+ expect_identical(sample_names(e1), new_sample_names)
+ expect_identical(sample_names(sample_data(e1)), new_sample_names)
+ expect_identical(sample_names(otu_table(e1)), new_sample_names)
+})
+test_that("Test intersect_*() and prune_*() methods behave as expected", {
+ e0 <- e1 # keep the original so the sample_names half can start fresh
+ # taxa_names<-
+ ## We assign new names to just one component, being sneaky and using the
+ ## not-recommended direct replacement with @slotname
+ ## This should work, but users should not do it in normal circumstances
+ new_taxa_names <- taxa_names(e1)
+ nunchained <- 5L # number of names left unchanged
+ i <- sample(ntaxa(e1), ntaxa(e1)-nunchained, replace=FALSE)
+ new_taxa_names[i] <- paste0("OTU-", taxa_names(e1)[i])
+ taxa_names(e1@tax_table) <- new_taxa_names
+ expect_false(identical(taxa_names(e1), new_taxa_names))
+ expect_identical(taxa_names(tax_table(e1)), new_taxa_names)
+ expect_false(identical(taxa_names(otu_table(e1)), new_taxa_names))
+ expect_false(identical(taxa_names(phy_tree(e1)), new_taxa_names))
+# expect_that(identical(taxa_names(refseq(e1)), new_taxa_names), is_false())
+ ## Okay so that worked. Now we test if the intersection functions behave
+ expect_identical(length(phyloseq:::intersect_taxa(e1)), nunchained)
+ e2 <- prune_taxa(phyloseq:::intersect_taxa(e1), e1)
+ expect_identical(ntaxa(e2), nunchained)
+ expect_true(setequal(taxa_names(e2), taxa_names(e1)[-i]))
+
+ # sample_names<-
+ e1 <- e0
+ ## We assign new names to just one component, being sneaky and using the
+ ## not-recommended direct replacement with @slotname
+ ## This should work, but users should not do it in normal circumstances
+ new_sample_names <- sample_names(e1)
+ nunchained <- 5L # number of names left unchanged
+ i <- sample(nsamples(e1), nsamples(e1)-nunchained, replace=FALSE)
+ new_sample_names[i] <- paste0("Sa-", sample_names(e1)[i])
+ sample_names(e1@sam_data) <- new_sample_names
+ expect_false(identical(sample_names(e1), new_sample_names))
+ expect_identical(sample_names(sample_data(e1)), new_sample_names)
+ expect_false(identical(sample_names(otu_table(e1)), new_sample_names))
+
+ ## Okay so that worked. Now we test if the intersection functions behave
+ expect_identical(length(phyloseq:::intersect_samples(e1)), nunchained)
+ e2 <- prune_samples(phyloseq:::intersect_samples(e1), e1)
+ expect_identical(nsamples(e2), nunchained)
+ expect_true(setequal(sample_names(e2), sample_names(e1)[-i]))
+
+})
+
+test_that("Test ordering", {
+ OTU <- otu_table(e1)
+ tree <- phy_tree(e1)
+ expect_identical(taxa_names(OTU), taxa_names(tree))
+ reotaxnames <- sample(taxa_names(tree), ntaxa(tree), FALSE)
+ expect_false(identical(taxa_names(OTU), reotaxnames))
+ # scramble order of taxa_names in tree by random arbitrary assignment
+ taxa_names(tree) <- reotaxnames
+ expect_false(identical(taxa_names(OTU), taxa_names(tree)))
+ # implicitly re-order in constructor
+ e3 <- phyloseq(OTU, tree)
+ expect_identical(taxa_names(e3), taxa_names(tree))
+ expect_identical(taxa_names(otu_table(e3)), taxa_names(phy_tree(e3)))
+ expect_identical(taxa_names(otu_table(e3)), taxa_names(phy_tree(e3))) # NOTE(review): repeats the previous check; kept as in the original
+
+ # Glad that worked. Now let's mess up sample indices in one component, and OTU indices in another
+ # then fix explicitly with index_reorder(e4, "both")
+ e4 <- e1
+ reosamplenames <- sample(sample_names(e1), nsamples(e1), FALSE)
+ sample_names(e4@sam_data) <- reosamplenames
+ taxa_names(e4@tax_table) <- reotaxnames
+
+ expect_false(identical(taxa_names(otu_table(e4)), taxa_names(tax_table(e4))))
+ expect_false(identical(sample_names(otu_table(e4)), sample_names(sample_data(e4))))
+ e4 <- phyloseq:::index_reorder(e4, "both")
+ expect_identical(taxa_names(otu_table(e4)), taxa_names(tax_table(e4)))
+ expect_identical(sample_names(otu_table(e4)), sample_names(sample_data(e4)))
+})
+
+################################################################################
diff --git a/tests/testthat/test-plot.R b/tests/testthat/test-plot.R
new file mode 100644
index 0000000..e5c9064
--- /dev/null
+++ b/tests/testthat/test-plot.R
@@ -0,0 +1,500 @@
+################################################################################
+# plot_ordination unit tests
+################################################################################
+library("phyloseq"); library("testthat"); library("ggplot2")
+data("GlobalPatterns")
+# Subset to small dataset for quicker testing
+GP <- prune_taxa(tail(names(sort(taxa_sums(GlobalPatterns))), 50), GlobalPatterns)
+
+# Pretend GP doesn't have sample_data or tax_table
+GP.tax <- tax_table(GP)
+GP.sd <- sample_data(GP)
+GP.tr <- phy_tree(GP)
+# GP <- phyloseq(otu_table(GP), GP.tr)
+GP.otu <- otu_table(GP)
+
+# Try ordination
+GP.ord <- ordinate(GP.otu, "DCA")
+
+# test_that encapsulation makes it difficult to fully test the formula / get()
+# step in the formula conversion, but that deprecated workaround is
+# still included and will hopefully bridge the gap for users switching
+# from previous formula-first use-cases, where the left-hand side of
+# the formula specified the phyloseq-data
+#test_that("plot_ordination: formula-first should give a deprecation warning", {
+# expect_warning(GP.ord.cap <- ordinate(GP~SampleType, "CAP"))
+# expect_warning(GP.ord.cca <- ordinate(GP~SampleType, "CCA"))
+# expect_warning(GP.ord.rda <- ordinate(GP~SampleType, "RDA"))
+# # But it still works.
+# expect_is(GP.ord.cap, "capscale")
+# expect_is(GP.ord.cca, "cca")
+# expect_is(GP.ord.rda, "rda")
+
+
+test_that("plot_ordination: Naked otu_table results in warning, but no error", {
+ expect_is(GP.ord, "decorana")
+ # "samples" type
+ expect_warning(plot_ordination(GP.otu, GP.ord, "samples"))
+ # "species" type
+ expect_warning(plot_ordination(GP.otu, GP.ord, "species"))
+ # "split" type
+ expect_warning(plot_ordination(GP.otu, GP.ord, "split"))
+ # "biplot" type
+ expect_warning(plot_ordination(GP.otu, GP.ord, "biplot"))
+})
+
+# Create (merged) phyloseq-class GP, and run comparisons
+test_that("all 4 plot_ordination type options result in valid ggplot2 object", {
+ GP <- merge_phyloseq(GP.otu, GP.tr)
+ # Rendering must not fail either, not just construction of the ggplot
+ # object: print() every plot and check the class of what it returns.
+ expect_is(print(plot_ordination(GP, GP.ord, "samples")), "list")
+ expect_is(print(plot_ordination(GP, GP.ord, "species")), "list")
+ expect_is(print(plot_ordination(GP, GP.ord, "split")), "list")
+ expect_is(print(plot_ordination(GP, GP.ord, "biplot")), "list")
+ # Without printing, the returned object must be of class ggplot.
+ expect_is(plot_ordination(GP, GP.ord, "samples"), "ggplot")
+ expect_is(plot_ordination(GP, GP.ord, "species"), "ggplot")
+ expect_is(plot_ordination(GP, GP.ord, "split"), "ggplot")
+ expect_is(plot_ordination(GP, GP.ord, "biplot"), "ggplot")
+})
+
+test_that("plot_ordination: The justDF=TRUE option returns a data.frame", {
+ # GP is rebuilt from OTU table and tree only (no ordination co-variables to plot)
+ GP <- merge_phyloseq(GP.otu, GP.tr)
+ expect_is(df0 <- plot_ordination(GP, GP.ord, "species", justDF=TRUE), "data.frame")
+ expect_is(df1 <- plot_ordination(GP, GP.ord, "samples", justDF=TRUE), "data.frame")
+ expect_is(df2 <- plot_ordination(GP, GP.ord, "split", justDF=TRUE), "data.frame")
+ expect_is(df3 <- plot_ordination(GP, GP.ord, "biplot", justDF=TRUE), "data.frame")
+ # The split and biplot data.frames are expected to be identical.
+ expect_identical(df2, df3)
+})
+
+test_that("plot_ordination: When variables are present or not, color SampleType", {
+ p1 <- plot_ordination(GP, GP.ord, "samples", color="SampleType")
+ expect_warning(p2 <- plot_ordination(GP, GP.ord, "species", color="SampleType"))
+ p3 <- plot_ordination(GP, GP.ord, "split", color="SampleType")
+ p4 <- plot_ordination(GP, GP.ord, "biplot", color="SampleType")
+ plots <- list(p1, p2, p3, p4)
+ # Every result must be a ggplot object ...
+ for (p in plots) {
+ expect_is(p, "ggplot")
+ }
+ # ... and printing it must return a list
+ for (p in plots) {
+ expect_is(print(p), "list")
+ }
+})
+
+
+test_that("plot_ordination: When variables are present or not, shape SampleType", {
+ # GP <- merge_phyloseq(GP.otu, GP.tr, GP.sd, GP.tax)
+ # Pare down samples to just five sample types, for shape plotting.
+ GP <- subset_samples(GP, SampleType %in% c("Feces", "Freshwater", "Ocean",
+ "Tongue", "Sediment (estuary)"))
+ # Some legend issues here that need tidying...
+ p1 <- plot_ordination(GP, GP.ord, "samples", shape="SampleType")
+ # The species-only plot is expected to warn, but must still return a plot
+ expect_warning(p2 <- plot_ordination(GP, GP.ord, "species", shape="SampleType"))
+ p3 <- plot_ordination(GP, GP.ord, "split", shape="SampleType")
+ p4 <- plot_ordination(GP, GP.ord, "biplot", shape="SampleType")
+ expect_is(p1, "ggplot")
+ expect_is(p2, "ggplot")
+ expect_is(p3, "ggplot")
+ expect_is(p4, "ggplot")
+ expect_is(print(p1), "list")
+ expect_is(print(p2), "list")
+ expect_is(print(p3), "list")
+ expect_is(print(p4), "list")
+})
+
+test_that("plot_ordination: When variables are present or not, label SampleType", {
+ p1 <- plot_ordination(GP, GP.ord, "samples", label="SampleType")
+ expect_warning(p2 <- plot_ordination(GP, GP.ord, "species", label="SampleType"))
+ p3 <- plot_ordination(GP, GP.ord, "split", label="SampleType")
+ p4 <- plot_ordination(GP, GP.ord, "biplot", label="SampleType")
+ # Each result must be a ggplot object, and printing it must return a list
+ expect_is(p1, "ggplot")
+ expect_is(p2, "ggplot")
+ expect_is(p3, "ggplot")
+ expect_is(p4, "ggplot")
+ expect_is(print(p1), "list")
+ expect_is(print(p2), "list")
+ expect_is(print(p3), "list")
+ expect_is(print(p4), "list")
+})
+
+test_that("plot_ordination: Continuous variables still mapped, uses added dummy variable", {
+ # Attach a made-up continuous sample variable
+ sample_data(GP)$OMEGA3_FA_CONC <- sample(1:100, nsamples(GP))
+ p1 <- plot_ordination(GP, GP.ord, "samples", color="OMEGA3_FA_CONC")
+ # Mapping a continuous variable to shape still builds a ggplot, but printing it errors
+ p2 <- plot_ordination(GP, GP.ord, "samples", shape="OMEGA3_FA_CONC")
+ # A `label` can be mapped to continuous var. It is coerced to char and printed.
+ p3 <- plot_ordination(GP, GP.ord, "samples", label="OMEGA3_FA_CONC")
+ expect_is(p1, "ggplot")
+ expect_is(p2, "ggplot")
+ expect_is(p3, "ggplot")
+ expect_error(print(p2))
+ expect_is(print(p1), "list")
+ expect_is(print(p3), "list")
+})
+
+test_that("plot_ordination: Some additional formats and warnings.", {
+ GP.ord.cca <- ordinate(GP, "CCA")
+ GP.ord.mdsbray <- ordinate(GP, "MDS", "bray")
+ expect_is(p1 <- plot_ordination(GP, GP.ord.mdsbray, type="TaXa", color="Phylum", title="p1"), "ggplot")
+ expect_is(p2 <- plot_ordination(GP, GP.ord.mdsbray, type="SpLit", color="Phylum", title="p2"), "ggplot")
+ expect_warning(p3 <- plot_ordination(GP, GP.ord.mdsbray, type="SamplE", color="Kingdom", title="p3"))
+ expect_is(p3, "ggplot")
+ expect_is(p4 <- plot_ordination(GP, GP.ord.cca, type="TaXa", color="Kingdom", title="p4"), "ggplot")
+ expect_is(p5 <- plot_ordination(GP, GP.ord.cca, type="samPle", color="SampleType", title="p5"), "ggplot")
+ expect_is(p6 <- plot_ordination(GP, GP.ord.cca, type="biplot", color="SampleType", title="p6"), "ggplot")
+ expect_is(p7 <- plot_ordination(GP, GP.ord.cca, type="biplot",
+ label="X.SampleID", color="SampleType", title="p7"), "ggplot")
+ expect_is(p7b <- plot_ordination(GP, GP.ord.cca, type="biplot",
+ label="X.SampleID", color=NULL, title="p7b"), "ggplot")
+ expect_is(p7c <- plot_ordination(GP, GP.ord.cca, type="biplot",
+ label="Phylum", color=NULL, title="p7c"), "ggplot")
+ expect_is(p7d <- plot_ordination(GP, GP.ord.cca, type="biplot",
+ label="Phylum", color="SampleType", title="p7d"), "ggplot")
+ expect_is(p8 <- plot_ordination(GP, GP.ord.cca, type="scree",
+ label="X.SampleID", color="SampleType", title="p8"), "ggplot")
+ expect_is(p9 <- plot_ordination(GP, GP.ord.cca, type=" sPlit __ ",
+ label="Phylum", color="SampleType", title="p9"), "ggplot")
+ expect_is(print(p1), "list")
+ expect_is(print(p2), "list")
+ expect_is(print(p3), "list")
+ expect_is(print(p4), "list")
+ expect_is(print(p5), "list")
+ expect_is(print(p6), "list")
+ expect_is(print(p7), "list")
+ expect_is(print(p7b), "list")
+ expect_is(print(p7c), "list")
+ expect_is(print(p7d), "list")
+ expect_is(print(p8), "list")
+ expect_is(print(p9), "list")
+ # A few more related to new `wascores` support as default backup coordinates
+ xnames <- tapply(taxa_sums(GlobalPatterns), tax_table(GlobalPatterns)[, "Phylum"], sum)
+ xnames <- names(sort(xnames, decreasing = TRUE))[1:5]
+ GP <- prune_taxa(taxa_sums(GlobalPatterns) > 1E4, GlobalPatterns)
+ GP <- prune_taxa(tax_table(GP)[, "Phylum"] %in% xnames, GP)
+ x <- ordinate(GP, method = "MDS", distance = "unifrac", weighted=TRUE)
+ y <- ordinate(GP, method = "CCA")
+ z <- ordinate(GP, method = "CAP", "unifrac", ~SampleType)
+ z1 <- ordinate(GP, method = "CAP", "bray", ~SampleType)
+ # Try a bunch more with splits and biplots
+ expect_is(p11 <- plot_ordination(GP, x, type = "biplot", color="Phylum"), "ggplot")
+ expect_is(print(p11), "list")
+ expect_is(p12 <- plot_ordination(GP, x, type = "biplot", color="SampleType", shape="Phylum"), "ggplot")
+ expect_is(print(p12), "list")
+ expect_is(p13 <- plot_ordination(GP, x, type = "split", color="SampleType", shape="Phylum"), "ggplot")
+ expect_is(print(p13), "list")
+ expect_is(p14 <- plot_ordination(GP, x, type = "split", color="Phylum"), "ggplot")
+ expect_is(print(p14), "list")
+ expect_is(p15 <- plot_ordination(GP, y, type = "biplot", color="Phylum"), "ggplot")
+ expect_is(print(p15), "list")
+ expect_is(p16 <- plot_ordination(GP, y, type = "species", color="Phylum"), "ggplot")
+ expect_is(print(p16), "list")
+ expect_is(p17 <- plot_ordination(GP, z, type = "biplot", color="Phylum"), "ggplot")
+ expect_is(print(p17), "list")
+ expect_is(p18 <- plot_ordination(GP, z, type = "biplot", color="SampleType", shape="Phylum"), "ggplot")
+ expect_is(print(p18), "list")
+ expect_is(p19 <- plot_ordination(GP, z1, type = "biplot", color="Phylum"), "ggplot")
+ expect_is(print(p19), "list")
+})
+
+test_that("plot_ordination: CAP method", {
+ # Works with a named formula argument
+ GP.ord.cap1 <- ordinate(GP, method="CAP", distance="bray", formula=~SampleType)
+ expect_is(GP.ord.cap1, "capscale")
+ # Works without naming the formula argument
+ GP.ord.cap2 <- ordinate(GP, method="CAP", distance="bray", ~SampleType)
+ expect_is(GP.ord.cap2, "capscale")
+ expect_equivalent(GP.ord.cap1, GP.ord.cap2)
+ # Works with precomputed distance matrix
+ Dist <- distance(GP, "bray", type="samples")
+ GP.ord.cap3 <- ordinate(physeq=GP, method="CAP", distance=Dist, ~SampleType)
+ expect_is(GP.ord.cap3, "capscale")
+ # Can't expect equivalent b/c pre-computed distance
+ # won't carryover any species/taxa scores.
+ #expect_equivalent(GP.ord.cap1, GP.ord.cap3)
+ GP.ord.cap <- ordinate(GP, method="CAP", distance="bray", formula=~SampleType)
+ expect_is(GP.ord.cap, "capscale")
+ expect_is(p4 <- plot_ordination(GP, GP.ord.cap, type="TaXa",
+ color="Phylum", title="p4"), "ggplot")
+ expect_is(p5 <- plot_ordination(GP, GP.ord.cap, type="samPle",
+ color="SampleType", title="p5"), "ggplot")
+ expect_is(p6 <- plot_ordination(GP, GP.ord.cap, type="biplot",
+ color="SampleType", title="p6"), "ggplot")
+ expect_is(p7 <- plot_ordination(GP, GP.ord.cap, type="biplot", label="X.SampleID",
+ color="SampleType", title="p7"), "ggplot")
+ expect_is(p7b <- plot_ordination(GP, GP.ord.cap, type="biplot", label="X.SampleID",
+ color=NULL, title="p7b"), "ggplot")
+ expect_is(p7c <- plot_ordination(GP, GP.ord.cap, type="biplot",
+ label="Phylum", color=NULL, title="p7c"), "ggplot")
+ expect_is(p7d <- plot_ordination(GP, GP.ord.cap, type="biplot", label="Phylum",
+ color="SampleType", title="p7d"), "ggplot")
+ expect_is(p8 <- plot_ordination(GP, GP.ord.cap, type="scree", label="X.SampleID",
+ color="SampleType", title="p8"), "ggplot")
+ expect_is(p9 <- plot_ordination(GP, GP.ord.cap, type=" sPlit __ ", label="Phylum",
+ color="SampleType", title="p9"), "ggplot")
+ expect_is(print(p4), "list")
+ expect_is(print(p5), "list")
+ expect_is(print(p6), "list")
+ expect_is(print(p7), "list")
+ expect_is(print(p7b), "list")
+ expect_is(print(p7c), "list")
+ expect_is(print(p7d), "list")
+ expect_is(print(p8), "list")
+ expect_is(print(p9), "list")
+})
+
+# Constrained CCA / RDA
+test_that("plot_ordination: CCA, RDA method", {
+ # Constrained RDA and CCA both work.
+ GP.ord.cca <- ordinate(GP, "CCA", NULL, formula=~SampleType)
+ expect_is(GP.ord.cca, "cca")
+ GP.ord.rda <- ordinate(GP, "RDA", NULL, formula=~SampleType)
+ expect_is(GP.ord.rda, "rda")
+ # Test plotting CCA
+ expect_is(p4 <- plot_ordination(GP, GP.ord.cca, type="TaXa",
+ color="Phylum", title="p4"), "ggplot")
+ expect_is(p5 <- plot_ordination(GP, GP.ord.cca, type="samPle",
+ color="SampleType", title="p5"), "ggplot")
+ expect_is(p6 <- plot_ordination(GP, GP.ord.cca, type="biplot",
+ color="SampleType", title="p6"), "ggplot")
+ expect_is(p7 <- plot_ordination(GP, GP.ord.cca, type="biplot", label="X.SampleID",
+ color="SampleType", title="p7"), "ggplot")
+ expect_is(p7b <- plot_ordination(GP, GP.ord.cca, type="biplot", label="X.SampleID",
+ color=NULL, title="p7b"), "ggplot")
+ expect_is(p7c <- plot_ordination(GP, GP.ord.cca, type="biplot",
+ label="Phylum", color=NULL, title="p7c"), "ggplot")
+ expect_is(p7d <- plot_ordination(GP, GP.ord.cca, type="biplot", label="Phylum",
+ color="SampleType", title="p7d"), "ggplot")
+ expect_is(p8 <- plot_ordination(GP, GP.ord.cca, type="scree", label="X.SampleID",
+ color="SampleType", title="p8"), "ggplot")
+ expect_is(p9 <- plot_ordination(GP, GP.ord.cca, type=" sPlit __ ", label="Phylum",
+ color="SampleType", title="p9"), "ggplot")
+ expect_is(print(p4), "list")
+ expect_is(print(p5), "list")
+ expect_is(print(p6), "list")
+ expect_is(print(p7), "list")
+ expect_is(print(p7b), "list")
+ expect_is(print(p7c), "list")
+ expect_is(print(p7d), "list")
+ expect_is(print(p8), "list")
+ expect_is(print(p9), "list")
+ # Repeat test-plotting RDA
+ expect_is(p4 <- plot_ordination(GP, GP.ord.rda, type="TaXa",
+ color="Phylum", title="p4"), "ggplot")
+ expect_is(p5 <- plot_ordination(GP, GP.ord.rda, type="samPle",
+ color="SampleType", title="p5"), "ggplot")
+ expect_is(p6 <- plot_ordination(GP, GP.ord.rda, type="biplot",
+ color="SampleType", title="p6"), "ggplot")
+ expect_is(p7 <- plot_ordination(GP, GP.ord.rda, type="biplot", label="X.SampleID",
+ color="SampleType", title="p7"), "ggplot")
+ expect_is(p7b <- plot_ordination(GP, GP.ord.rda, type="biplot", label="X.SampleID",
+ color=NULL, title="p7b"), "ggplot")
+ expect_is(p7c <- plot_ordination(GP, GP.ord.rda, type="biplot",
+ label="Phylum", color=NULL, title="p7c"), "ggplot")
+ expect_is(p7d <- plot_ordination(GP, GP.ord.rda, type="biplot", label="Phylum",
+ color="SampleType", title="p7d"), "ggplot")
+ expect_is(p8 <- plot_ordination(GP, GP.ord.rda, type="scree", label="X.SampleID",
+ color="SampleType", title="p8"), "ggplot")
+ expect_is(p9 <- plot_ordination(GP, GP.ord.rda, type=" sPlit __ ", label="Phylum",
+ color="SampleType", title="p9"), "ggplot")
+ expect_is(print(p4), "list")
+ expect_is(print(p5), "list")
+ expect_is(print(p6), "list")
+ expect_is(print(p7), "list")
+ expect_is(print(p7b), "list")
+ expect_is(print(p7c), "list")
+ expect_is(print(p7d), "list")
+ expect_is(print(p8), "list")
+ expect_is(print(p9), "list")
+})
+################################################################################
+# Other plot function tests...
+################################################################################
+# plot_richness tests
+################################################################################
+test_that("estimate_richness: test values, classes", {
+ data("soilrep")
+ data("GlobalPatterns")
+ # With no `measures` argument, all available measures are returned
+ erdf <- estimate_richness(soilrep)
+ expect_is(erdf, "data.frame")
+ expect_equivalent(nrow(erdf), 56)
+ # All expected measures, and certain standard errors, are among the columns
+ measure_cols <- c("Observed", "Chao1", "ACE", "Shannon", "Simpson", "InvSimpson", "Fisher")
+ expect_true(all(measure_cols %in% colnames(erdf)))
+ expect_true(all(c("se.chao1", "se.ACE") %in% colnames(erdf)))
+ # Column-wise tallies of OTU-table entries that satisfy `keep`
+ count_where <- function(physeq, keep) apply(otu_table(physeq), 2, function(x) sum(keep(x)))
+ S_0 <- count_where(soilrep, function(x) x > 0)
+ a1 <- count_where(soilrep, function(x) x == 1)
+ a2 <- count_where(soilrep, function(x) x == 2)
+ expect_equivalent(erdf$Observed, S_0)
+ expect_equivalent(estimate_richness(GlobalPatterns, measures="Observed")[, 1],
+ count_where(GlobalPatterns, function(x) x > 0))
+ # Calculate "manually" the values that should be Chao1, compare with result.
+ S_P <- S_0 + a1*(a1-1)/(2*(a2+1))
+ expect_equivalent(round(S_P, 4), round(estimate_richness(soilrep, measures="Chao1")[, "Chao1"], 4))
+ # A single requested measure must still come back as a data.frame
+ expect_is(estimate_richness(soilrep, measures="Observed"), "data.frame")
+ # Request a subset of measures and spot-check the first five Shannon values
+ x <- estimate_richness(soilrep, measures=c("Observed", "InvSimpson", "Shannon", "Chao1"))
+ expect_equivalent(round(x[1:5, "Shannon"], 4), round(c(6.540578, 6.715170, 6.948412, 7.343088, 6.838917), 4))
+})
+
+test_that("plot_richness: Standard plots work", {
+ data("soilrep")
+ all_measures <- c("Observed", "Chao1", "ACE", "Shannon", "Simpson", "InvSimpson", "Fisher")
+ p_all <- plot_richness(soilrep)
+ expect_is(p_all, "ggplot")
+ expect_equivalent(levels(p_all$data$variable), all_measures)
+ expect_false(all(is.na(p_all$data$se)))
+ expect_true(any(is.na(p_all$data$se)))
+ p_two <- plot_richness(soilrep, measures=c("Observed", "Chao1"))
+ expect_is(p_two, "ggplot")
+ expect_equivalent(levels(p_two$data$variable), c("Observed", "Chao1"))
+})
+
+test_that("plot_richness: sortby argument works correctly", {
+ data("soilrep")
+ # A `sortby` value that is not among the `measures` ("Treatment" here)
+ # should raise a warning while a plot is still produced.
+ expect_warning(p_bad_sortby <- plot_richness(soilrep, sortby="Treatment"))
+ expect_is(p_bad_sortby, "ggplot")
+ # `sortby` only matters when the `x` argument is discrete. With a numeric
+ # `x` a warning is expected, and again a plot is still produced.
+ # A dummy numeric sample variable is added for this purpose.
+ sample_data(soilrep)$dummy <- runif(nsamples(soilrep))
+ expect_warning(p_numeric_x <- plot_richness(soilrep, x="dummy", sortby="Chao1"))
+ expect_is(p_numeric_x, "ggplot")
+ # Default `x` is "samples", always discrete
+ p_by_sample <- plot_richness(soilrep, sortby="Chao1")
+ expect_equivalent(levels(p_by_sample$data$samples)[1:5],
+ c("a_C137", "a_C145", "a_C126", "a_C156", "a_C139"))
+ expect_is(p_by_sample, "ggplot")
+ # The sort on an aggregated discrete variable must give the right order too.
+ p_by_treatment <- plot_richness(soilrep, x="Treatment", sortby="Simpson")
+ expect_equivalent(levels(p_by_treatment$data$Treatment), c("UC", "WC", "WU", "UU"))
+ expect_is(p_by_treatment, "ggplot")
+})
+
+test_that("plot_richness/estimate_richness: fisher.alpha", {
+ data("soilrep")
+ data("GlobalPatterns")
+ expect_is(p_soil <- plot_richness(soilrep, measures="Fisher"), "ggplot")
+ p_gp <- plot_richness(GlobalPatterns, measures="Fisher")
+ expect_is(p_gp, "ggplot")
+ expect_equivalent(levels(p_gp$data$variable), "Fisher")
+})
+################################################################################
+# Test psmelt properly protects against various name collisions
+################################################################################
+test_that("psmelt properly protects against various name collisions", {
+ data("GlobalPatterns")
+ gp.ch <- subset_taxa(GlobalPatterns, Phylum == "Chlamydiae")
+ ps1 <- NULL
+ gp1 <- gp.ch
+ # type-1a conflict: a sample variable named "Abundance"
+ sample_data(gp1)$Abundance <- paste0("Sa-", seq_len(nsamples(gp1)))
+ expect_warning(ps1 <- psmelt(gp1))
+ expect_equal(colnames(ps1)[1:3], c("OTU", "Sample", "Abundance"))
+ expect_equal(dim(ps1), c(546L, 18L))
+ expect_true("sample_Abundance" %in% colnames(ps1))
+ # type-1a conflict: a sample variable named "OTU"
+ ps1 <- NULL
+ gp1 <- gp.ch
+ sample_data(gp1)$OTU <- paste0("Sa-", seq_len(nsamples(gp1)))
+ expect_warning(ps1 <- psmelt(gp1))
+ expect_equal(colnames(ps1)[1:3], c("OTU", "Sample", "Abundance"))
+ expect_equal(dim(ps1), c(546L, 18L))
+ expect_true("sample_OTU" %in% colnames(ps1))
+ # type-1a conflict: a sample variable named "Sample"
+ ps1 <- NULL
+ gp1 <- gp.ch
+ sample_data(gp1)$Sample <- paste0("Sa-", seq_len(nsamples(gp1)))
+ expect_warning(ps1 <- psmelt(gp1))
+ expect_equal(colnames(ps1)[1:3], c("OTU", "Sample", "Abundance"))
+ expect_equal(dim(ps1), c(546L, 18L))
+ expect_true("sample_Sample" %in% colnames(ps1))
+ # type-1b conflict: a taxonomic rank named like a special variable
+ ps1 <- NULL
+ gp1 <- gp.ch
+ tax_table(gp1) <- cbind(tax_table(gp1), Sample=paste0("ta", taxa_names(gp1)))
+ expect_warning(ps1 <- psmelt(gp1))
+ expect_equal(colnames(ps1)[1:3], c("OTU", "Sample", "Abundance"))
+ expect_equal(dim(ps1), c(546L, 18L))
+ expect_true("taxa_Sample" %in% colnames(ps1))
+ # type-2 conflict: the same name used by rank_names and by sample_data
+ ps1 <- NULL
+ gp1 <- gp.ch
+ tax_table(gp1) <- cbind(tax_table(gp1), Primer=paste0("ta", taxa_names(gp1)))
+ expect_warning(ps1 <- psmelt(gp1))
+ expect_equal(colnames(ps1)[1:3], c("OTU", "Sample", "Abundance"))
+ expect_equal(dim(ps1), c(546L, 18L))
+ expect_true("sample_Primer" %in% colnames(ps1))
+ # Every conflict type combined in one object
+ ps1 <- NULL
+ gp1 <- gp.ch
+ sample_data(gp1)$Abundance <- paste0("Sa-", seq_len(nsamples(gp1)))
+ sample_data(gp1)$OTU <- paste0("Sa-", seq_len(nsamples(gp1)))
+ sample_data(gp1)$Sample <- paste0("Sa-", seq_len(nsamples(gp1)))
+ tax_table(gp1) <- cbind(tax_table(gp1), Sample=paste0("ta", taxa_names(gp1)))
+ tax_table(gp1) <- cbind(tax_table(gp1), Primer=paste0("ta", taxa_names(gp1)))
+ expect_warning(ps1 <- psmelt(gp1))
+ expect_equal(colnames(ps1)[1:3], c("OTU", "Sample", "Abundance"))
+ expect_equal(dim(ps1), c(546L, 22L))
+ renamed_cols <- c("sample_OTU", "sample_Sample", "sample_Abundance",
+ "sample_Primer", "taxa_Sample")
+ expect_true(all(renamed_cols %in% colnames(ps1)))
+})
+################################################################################
+test_that("psmelt correctly handles phyloseq data with NULL components, and OTU tables", {
+ data("GlobalPatterns")
+ GP <- prune_taxa(names(sort(taxa_sums(GlobalPatterns), TRUE)[1:50]), GlobalPatterns)
+ # Objects in which one or more components are left out
+ GPS <- phyloseq(otu_table(GP), sample_data(GP), phy_tree(GP))
+ GPT <- phyloseq(otu_table(GP), tax_table(GP), phy_tree(GP))
+ GPTr <- phyloseq(otu_table(GP), phy_tree(GP))
+ GPN <- otu_table(GP)
+ # Call psmelt directly on each. Should be no errors or warnings.
+ expect_is(testT <- psmelt(GPT), "data.frame")
+ expect_is(testS <- psmelt(GPS), "data.frame")
+ expect_is(testTr <- psmelt(GPTr), "data.frame")
+ expect_is(testN <- psmelt(GPN), "data.frame")
+ # Column types and column names of the melted results
+ expect_is(testT$Abundance, "numeric")
+ expect_is(testT$OTU, "character")
+ expect_is(testT$Sample, "character")
+ expect_equivalent(colnames(testT), c("OTU", "Sample", "Abundance", "Kingdom", "Phylum",
+ "Class", "Order", "Family", "Genus", "Species"))
+ expect_equivalent(colnames(testS), c("Sample", "OTU", "Abundance", "X.SampleID", "Primer",
+ "Final_Barcode", "Barcode_truncated_plus_T",
+ "Barcode_full_length", "SampleType", "Description"))
+ # Exercise psmelt indirectly, through plot functions that rely on it
+ expect_is(pS <- plot_tree(GPS, color="SampleType"), "ggplot")
+ expect_is(pT <- plot_tree(GPT, shape="Kingdom"), "ggplot")
+ expect_is(pTr <- plot_tree(GPTr), "ggplot")
+ expect_is(pN <- plot_bar(GPN), "ggplot")
+ expect_is(prPS <- print(pS), "list")
+ expect_is(prPT <- print(pT), "list")
+ expect_is(prPTr <- print(pTr), "list")
+ expect_is(prPN <- print(pN), "list")
+})
+test_that("psmelt doesn't break when the number of taxa is 1", {
+ data("GlobalPatterns")
+ first_taxon <- taxa_names(GlobalPatterns)[1]
+ # Pruning down to one OTU is expected to warn (tree removal).
+ expect_warning(GP1 <- prune_taxa(first_taxon, GlobalPatterns))
+ expect_equal(ntaxa(GP1), 1)
+ df <- psmelt(GP1)
+ expect_is(df, "data.frame")
+ expect_true(all(c("OTU", "Sample", "Abundance", "SampleType", "Kingdom", "Phylum") %in% names(df)))
+ expect_equivalent(sum(df$Abundance, na.rm = TRUE), taxa_sums(GP1))
+})
+################################################################################
diff --git a/tests/testthat/test-rarefy.R b/tests/testthat/test-rarefy.R
new file mode 100644
index 0000000..09675e5
--- /dev/null
+++ b/tests/testthat/test-rarefy.R
@@ -0,0 +1,61 @@
+################################################################################
+# Use testthat to test phyloseq transformation functions/methods
+################################################################################
+library("phyloseq"); library("testthat")
+# # # # TESTS!
+################################################################################
+# rarefy_even_depth
+################################################################################
+data("GlobalPatterns")
+set.seed(711) # The random seed for randomly selecting subset of OTUs
+randoOTUs = sample(taxa_names(GlobalPatterns), 100, FALSE)
+GP100 = prune_taxa(randoOTUs, GlobalPatterns)
+min_lib = 1000
+# rngseed=FALSE is passed below, so reproducibility presumably relies on the set.seed(711) call above
+rGP = suppressMessages(rarefy_even_depth(GP100, sample.size=min_lib, rngseed=FALSE))
+rGPr = suppressMessages(rarefy_even_depth(GP100, sample.size=min_lib, rngseed=FALSE, replace=FALSE))
+################################################################################
+# Test that specific OTUs and samples were removed
+################################################################################
+test_that("Test that empty OTUs and samples were automatically pruned", {
+ rmOTU <- setdiff(taxa_names(GP100), taxa_names(rGP))
+ expect_equal(length(rmOTU), 20L)
+ expect_equal(rmOTU[1:5], c("534601", "408325", "325564", "8112", "571917"))
+ expect_true(all(taxa_names(GP100)[taxa_sums(GP100) <= 0] %in% rmOTU))
+ expect_true(all(taxa_sums(rGP) > 0))
+ rmsam <- setdiff(sample_names(GP100), sample_names(rGP))
+ expect_equal(length(rmsam), 12L)
+ expect_equal(rmsam[1:5], c("M11Fcsw", "M31Tong", "M11Tong", "NP2", "TRRsed1"))
+ expect_true(all(sample_sums(rGP) > 0))
+})
+################################################################################
+# Test specific values. Should be reproducible, and you set the seed.
+################################################################################
+test_that("Test values", {
+ # Pull a slice of the OTU table out as a plain vector
+ otu_slice <- function(physeq, i, j) {
+ as(otu_table(physeq)[i, j], "vector")
+ }
+ # with replacement values
+ expect_equal(otu_slice(rGP, 1, 3:10), rep(0, 8))
+ expect_equal(otu_slice(rGP, 2, 1:10), c(rep(0, 9), 2))
+ expect_equal(otu_slice(rGP, 3, 8:12), c(892, 956, 56, 10, 25))
+ expect_equal(otu_slice(rGP, 70:78, 4), c(710, 2, 0, 2, 0, 8, 154, 2, 0))
+ # without replacement values
+ expect_equal(otu_slice(rGPr, 1, 3:10), c(rep(0, 7), 1))
+ expect_equal(otu_slice(rGPr, 2, 1:10), c(rep(0, 5), 4, 0, 877, 960, 55))
+ expect_equal(otu_slice(rGPr, 3, 8:12), c(10, 34, 2, 0, 2))
+ expect_equal(otu_slice(rGPr, 70:78, 4), c(0, 706, 1, 0, 2, 0, 5, 173, 1))
+})
+################################################################################
+# Include tests from the rarefy-without-replacement results, used by many.
+#################################################################################
+test_that("Test library sizes are all the same set value", {
+ expect_false(any(sample_sums(rGP) != min_lib))
+ expect_false(any(sample_sums(rGPr) != min_lib))
+})
+test_that("The same samples should have been cut in each results", {
+ expect_true(setequal(sample_names(rGP), sample_names(rGPr)))
+ expect_equal(nsamples(rGP), 14)
+})
+################################################################################
\ No newline at end of file
diff --git a/tests/testthat/test-subset.R b/tests/testthat/test-subset.R
new file mode 100644
index 0000000..8cf855f
--- /dev/null
+++ b/tests/testthat/test-subset.R
@@ -0,0 +1,110 @@
+# load libraries
+library("phyloseq"); library("testthat")
+# # # # TESTS!
+set.seed(888)
+
+# Load GP dataset
+data("GlobalPatterns")
+GP <- GlobalPatterns
+keepNames <- sample_names(GP)[5:7]
+
+################################################################################
+# prune_
+################################################################################
+
+test_that("Classes of pruned phyloseq and its components are as expected", {
+ GP3 <- prune_samples(keepNames, GP)
+ expect_identical(nsamples(GP3), 3L)
+ expect_is(GP3, "phyloseq")
+ expect_is(access(GP3, "sam_data"), "sample_data")
+ expect_is(access(GP3, "otu_table"), "otu_table")
+ expect_is(access(GP3, "phy_tree"), "phylo")
+ expect_is(access(GP3, "tax_table"), "taxonomyTable")
+ # Repeat on an object built from OTU table and taxonomy table only
+ GPnoSD <- phyloseq(otu_table(GP), tax_table(GP))
+ GP3noSD <- prune_samples(keepNames, GPnoSD)
+ expect_identical(nsamples(GP3noSD), 3L)
+ expect_is(access(GP3noSD, "otu_table"), "otu_table")
+ expect_is(access(GP3noSD, "sam_data"), "NULL")
+ expect_is(access(GP3noSD, "phy_tree"), "NULL")
+ expect_is(access(GP3noSD, "tax_table"), "taxonomyTable")
+})
+
+test_that("prune_samples works on sample_data-only and otu_table-only data", {
+ GPotu <- prune_samples(keepNames, access(GP, "otu_table", TRUE))
+ GPsd <- prune_samples(keepNames, access(GP, "sam_data", TRUE))
+ expect_identical(nsamples(GPotu), 3L)
+ expect_identical(nsamples(GPsd), 3L)
+ expect_is(GPotu, "otu_table")
+ expect_is(GPsd, "sample_data")
+ expect_identical(dim(GPotu), c(19216L, 3L))
+ expect_identical(dim(GPsd), c(3L, 7L))
+})
+
+# Coerce orientation for apply
+otumat <- as(otu_table(GP), "matrix")
+if (!taxa_are_rows(GP)) {
+ # taxa are columns here: transpose so that rows are taxa
+ otumat <- t(otumat)
+}
+# Count in how many samples each OTU was observed more than 5 times.
+samobs <- apply(otumat, 1, function(x, m) sum(x > m), m=5L)
+# Keep only the most prevalent 50 of these
+samobs <- sort(samobs, TRUE)[1:50]
+# Shuffle the names on purpose.
+samobs <- sample(samobs, length(samobs), FALSE)
+
+test_that("Initial order before pruning check is different", {
+ expect_false(setequal(names(samobs), taxa_names(phy_tree(GP))[1:50]))
+ expect_false(setequal(names(samobs), taxa_names(GP)[1:50]))
+ expect_false(identical(names(samobs), taxa_names(GP)[1:50]))
+})
+
+# prune to just samobs OTUs
+pGP = prune_taxa(names(samobs), GP)
+
+test_that("The set of names should be the same after pruning, names(samobs)", {
+ expect_true(setequal(names(samobs), taxa_names(phy_tree(pGP))))
+ expect_true(setequal(names(samobs), taxa_names(otu_table(pGP))))
+ expect_true(setequal(names(samobs), taxa_names(tax_table(pGP))))
+})
+
+test_that("The set/order of taxa names after pruning should be consistent", {
+ # same set of names in every component
+ expect_true(setequal(taxa_names(pGP), taxa_names(phy_tree(pGP))))
+ expect_true(setequal(taxa_names(otu_table(pGP)), taxa_names(phy_tree(pGP))))
+ expect_true(setequal(taxa_names(tax_table(pGP)), taxa_names(phy_tree(pGP))))
+ # same order too, while the shuffled names(samobs) order must not match
+ expect_true(identical(taxa_names(pGP), taxa_names(phy_tree(pGP))))
+ expect_true(identical(taxa_names(otu_table(pGP)), taxa_names(phy_tree(pGP))))
+ expect_true(identical(taxa_names(tax_table(pGP)), taxa_names(phy_tree(pGP))))
+ expect_false(identical(names(samobs), taxa_names(phy_tree(pGP))))
+ # plot_tree(pGP, "sampledodge", nodeplotblank, label.tips="taxa_names", plot.margin=0.75)
+})
+
+## Add this as backup test
+#' data("esophagus")
+#' esophagus
+#' plot(sort(taxa_sums(esophagus), TRUE), type="h", ylim=c(0, 50))
+#' x1 = prune_taxa(taxa_sums(esophagus) > 10, esophagus)
+#' x2 = prune_taxa(names(sort(taxa_sums(esophagus), TRUE))[1:9], esophagus)
+#' identical(x1, x2)
+
+
+################################################################################
+# test filter_taxa and other filter methods.
+################################################################################
+library("genefilter")
+data("enterotype")
+
+test_that("filter_taxa gives correct, reliable logicals and pruning", {
+ flist <- filterfun(kOverA(5, 2e-05))
+ ent.logi <- filter_taxa(enterotype, flist)
+ expect_is(ent.logi, "logical")
+ ent.trim <- filter_taxa(enterotype, flist, TRUE)
+ expect_is(ent.trim, "phyloseq")
+ expect_equal(sum(ent.logi), ntaxa(ent.trim))
+ expect_identical(prune_taxa(ent.logi, enterotype), ent.trim)
+ expect_equal(ntaxa(ent.trim), 416L)
+ expect_equal(nsamples(ent.trim), nsamples(enterotype))
+})
diff --git a/tests/testthat/test-transform.R b/tests/testthat/test-transform.R
new file mode 100644
index 0000000..b018767
--- /dev/null
+++ b/tests/testthat/test-transform.R
@@ -0,0 +1,100 @@
+################################################################################
+# Use testthat to test phyloseq transformation functions/methods
+################################################################################
+library("phyloseq"); library("testthat")
+# # # # TESTS!
+#set.seed(8888)
+
+################################################################################
+test_that("Can transform_sample_counts of an OTU table that is either orientation", {
+ data("esophagus")
+ OTU0 = otu_table(esophagus)
+ OTU1 = transform_sample_counts(OTU0, rank)
+ OTU2 = transform_sample_counts(t(OTU0), rank)
+ expect_that(identical(ntaxa(OTU0), ntaxa(OTU1)), is_true(),
+ "ntaxa OTU1 doesn't match original after transformation.")
+ expect_that(identical(ntaxa(OTU0), ntaxa(OTU2)), is_true(),
+ "ntaxa OTU2 doesn't match original after transformation.")
+})
+test_that("Can transform_sample_counts on phyloseq with either orientation", {
+ data("esophagus")
+ eso1 = eso2 = NULL
+ try(eso1 <- transform_sample_counts(esophagus, rank), TRUE)
+ try(eso2 <- transform_sample_counts(t(esophagus), rank), TRUE)
+ expect_false(is.null(eso1), "eso1 is NULL, valid phyloseq construction failed.")
+ expect_false(is.null(eso2), "eso2 is NULL, valid phyloseq construction failed.")
+ expect_is(eso1, "phyloseq", "class of eso1 is not phyloseq")
+ expect_is(eso2, "phyloseq", "class of eso2 is not phyloseq")
+ expect_equal(ntaxa(esophagus), ntaxa(eso1), info="ntaxa eso1 doesn't match original after transformation.")
+ expect_equal(ntaxa(esophagus), ntaxa(eso2), info="ntaxa eso2 doesn't match original after transformation.")
+})
+
+test_that("Test transform_sample_counts edge-cases", {
+ data("esophagus")
+ # Randomly pick the OTU and sample names to use for subsetting
+ # (Will be different each time tests are run)
+ OTUname1 = sample(taxa_names(esophagus), 1)
+ samname1 = sample(sample_names(esophagus), 1)
+ # Test that a one-OTU dataset still works
+ # It throws a warning because the tree is being removed when it becomes just one tip.
+ #suppressWarnings(eso1otu <- prune_taxa(sample(taxa_names(esophagus), 1), esophagus))
+ expect_warning(try(eso1otu <- prune_taxa(OTUname1, esophagus), TRUE))
+ try(eso1sam <- prune_samples(samname1, esophagus), TRUE)
+ # Test eso1otu
+ expect_equal(ntaxa(eso1otu), 1L)
+ expect_equal(nsamples(eso1otu), 3L)
+ # Behavior strange when tree removed by necessity.
+ # In this case a "phyloseq" object with just an OTU table.
+ # Really, this should just be considered an object of class "otu_table".
+ #expect_is(eso1otu, "phyloseq", "pruned-to-1-OTU esophagus not phyloseq")
+ eso1otu = otu_table(eso1otu)
+ expect_is(eso1otu, "otu_table")
+ expect_equal(dim(otu_table(eso1otu)), c(1L, nsamples(esophagus)))
+ # Test eso1sam
+ expect_is(eso1sam, "phyloseq", "pruned-to-1-sample esophagus not phyloseq")
+ expect_is(otu_table(eso1sam), "otu_table",
+ "pruned-to-1-sample esophagus OTU table not otu_table")
+ expect_equal(nsamples(eso1sam), 1L)
+ expect_equal(dim(otu_table(eso1sam)), c(ntaxa(esophagus), 1L))
+ # Now test transform_sample_counts
+ eso1samrank = eso1oturank = NULL
+ try(eso1samrank <- transform_sample_counts(eso1sam, rank), TRUE)
+ try(eso1samrankt <- transform_sample_counts(t(eso1sam), rank), TRUE)
+ try(eso1oturank <- transform_sample_counts(eso1otu, rank), TRUE)
+ try(eso1oturankt <- transform_sample_counts(t(eso1otu), rank), TRUE)
+ expect_is(eso1samrank, "phyloseq")
+ expect_is(eso1samrankt, "phyloseq")
+ expect_is(eso1oturank, "otu_table")
+ expect_is(eso1oturankt, "otu_table")
+})
+test_that("Test transform_sample_counts numerical result accuracy", {
+ data("esophagus")
+ es = esophagus
+ # addition
+ es1 = transform_sample_counts(es, function(x) x + 1)
+ expect_equal(otu_table(es1), otu_table(es) + 1, tolerance=0.1, info="addition fail")
+ # multiplication
+ es1 = transform_sample_counts(es, function(x) x * 2.5)
+ expect_equal(otu_table(es1), otu_table(es) * 2.5, tolerance=0.1, info="multiplication fail")
+ # element-wise exponentiation
+ es1 = transform_sample_counts(es, function(x) x ^ 2.5)
+ expect_equal(otu_table(es1), otu_table(es) ^ 2.5, tolerance=0.1, info="exponentiation fail")
+ # logarithm
+ es1 = transform_sample_counts(es, function(x) log(x+10) )
+ expect_equal(otu_table(es1), log(otu_table(es) + 10), tolerance=0.1, info="logarithm fail")
+ # Prune to a small subset. Need a test where "by-sample" matters. E.g. rank
+ es = prune_taxa(taxa_names(es)[1:5], esophagus)
+ es1 = transform_sample_counts(es, rank)
+ ans = c(5, 2, 4, 2, 2, 5, 2.5, 4, 2.5, 1, 5, 1.5, 1.5, 3.5, 3.5)
+ ans = otu_table(matrix(ans, ntaxa(es), nsamples(es), FALSE,
+ list(taxa_names(es), sample_names(es))), taxa_are_rows=TRUE)
+ expect_equal(otu_table(es1), ans, tolerance=0.1, info="rank fail")
+ # test where "by-sample" matters, after transpose
+ es = prune_taxa(taxa_names(esophagus)[1:5], t(esophagus))
+ es1 = transform_sample_counts(es, rank)
+ ans = c(5, 2, 4, 2, 2, 5, 2.5, 4, 2.5, 1, 5, 1.5, 1.5, 3.5, 3.5)
+ ans = otu_table(matrix(ans, nsamples(es), ntaxa(es), TRUE,
+ list(sample_names(es), taxa_names(es))), taxa_are_rows=FALSE)
+ expect_equal(otu_table(es1), ans, tolerance=0.1, info="rank fail")
+})
+
diff --git a/vignettes/import_qiime_directory_structure.jpg b/vignettes/import_qiime_directory_structure.jpg
new file mode 100644
index 0000000..43953b2
Binary files /dev/null and b/vignettes/import_qiime_directory_structure.jpg differ
diff --git a/vignettes/phyloseq-FAQ.Rmd b/vignettes/phyloseq-FAQ.Rmd
new file mode 100644
index 0000000..089c143
--- /dev/null
+++ b/vignettes/phyloseq-FAQ.Rmd
@@ -0,0 +1,562 @@
+---
+title: "phyloseq Frequently Asked Questions (FAQ)"
+date: "`r date()`"
+author: "Paul McMurdie and Susan Holmes"
+output:
+ BiocStyle::html_document:
+ fig_height: 7
+ fig_width: 10
+ toc: yes
+ toc_depth: 2
+ number_sections: true
+vignette: >
+ %\VignetteIndexEntry{phyloseq Frequently Asked Questions (FAQ)}
+ %\VignetteEngine{knitr::rmarkdown}
+ %\VignetteEncoding{UTF-8}
+---
+
+
+This vignette includes answers and supporting materials that address
+[frequently asked questions (FAQs)](https://en.wikipedia.org/wiki/FAQ),
+especially those posted on
+[the phyloseq issues tracker](https://github.com/joey711/phyloseq/issues).
+
+For most issues
+[the phyloseq issues tracker](https://github.com/joey711/phyloseq/issues)
+should suffice; but occasionally there are questions
+that are asked repeatedly enough that it becomes appropriate
+to canonize the answer here in this vignette.
+This is both
+(1) to help users find solutions more quickly, and
+(2) to mitigate redundancy on
+[the issues tracker](https://github.com/joey711/phyloseq/issues).
+
+All users are encouraged to perform a google search
+and review other questions/responses to both open and closed issues
+on [the phyloseq issues tracker](https://github.com/joey711/phyloseq/issues)
+before seeking an active response by posting a new issue.
+
+
+```{r, warning=FALSE, message=FALSE}
+library("phyloseq"); packageVersion("phyloseq")
+library("ggplot2"); packageVersion("ggplot2")
+theme_set(theme_bw())
+```
+
+
+# - I tried reading my biom file using phyloseq, but it didn’t work. What’s wrong?
+
+The most common cause for this error
+is derived from a massive change to the way biom files are stored on disk.
+There are currently two "versions" of the biom-format,
+each of which stores data very differently.
+The original format -- and original support in phyloseq --
+was for biom-format version 1 based on [JSON](https://en.wikipedia.org/wiki/JSON).
+
+The latest version -- version 2 -- is based on the
+[HDF5](https://www.hdfgroup.org/HDF5/doc/UG/index.html) file format,
+and this new biom format version
+recently became the default file output format
+for popular workflows like QIIME.
+
+## Good News: HDF5-biom should be supported in next release
+
+The *biomformat* package is the Bioconductor incarnation
+of R package support for the biom file format,
+written by Paul McMurdie (phyloseq author)
+and Joseph Paulson (metagenomeSeq author).
+Although it has been available on GitHub and BioC-devel
+for many months now,
+the first release version of *biomformat*
+on Bioconductor will be in April 2016.
+In that same release, phyloseq will switch over
+from the JSON-only *biom* package hosted on CRAN
+to this new package, *biomformat*,
+which simultaneously supports biom files
+based on either HDF5 or JSON.
+
+This difference will be largely opaque to users,
+and phyloseq will "just work" after the next release in April.
+
+Use the `import_biom` function to read your recent
+QIIME or other biom-format data.
+
+Additional background details are described in
+[Issue 443](https://github.com/joey711/phyloseq/issues/443).
+
+## HDF5 (Version 2.0) biom-format: *biomformat*
+
+As just described,
+HDF5 biom format is currently supported
+in the development version of phyloseq,
+via the new beta/development package called *biomformat*
+on BioC-devel and GitHub:
+
+https://github.com/joey711/biomformat
+
+If you need to use HDF5-based biom format files **immediately**
+and cannot wait for the upcoming release,
+then you should install the development version
+of the *biomformat* package by following the instructions
+at the link above.
+
+## Not every data component is included in .biom files
+
+Even though the biom-format supports the self-annotated inclusion
+of major components like the taxonomy table and sample data table,
+many tools that generate biom-format files
+(like QIIME, MG-RAST, mothur, etc.)
+do not export this data, even if you provided
+the information in your data input files.
+The reason for this boggles me,
+and I've shared my views on this with QIIME developers,
+but there nevertheless seems to be no plan to include your sample data
+in the output biom file.
+
+Furthermore, even though I have proposed it to the biom-format team,
+there is currently no support (or timeline for support)
+for inclusion of a phylogenetic tree within a ".biom" file.
+
+A number of tutorials are available
+demonstrating how one can add components to a phyloseq object
+after it has been created/imported.
+The following tutorial is especially relevant
+
+http://joey711.github.io/phyloseq-demo/import-biom-sd-example.html
+
+Which makes use of the following functions:
+
+- `import_qiime_sample_data`
+- `merge_phyloseq`
+
+## Other issues related to the biom-format
+
+There are a number of different Issue Tracker posts
+discussing this format with respect to phyloseq:
+
+https://github.com/joey711/phyloseq/issues/302
+
+https://github.com/joey711/phyloseq/issues/272
+
+https://github.com/joey711/phyloseq/issues/392
+
+[Issue 443](https://github.com/joey711/phyloseq/issues/443)
+has details for updated format.
+
+
+# - `microbio_me_qiime()` returned an error. What’s wrong?
+
+## The QIIME-DB Server is Permanently Down.
+
+The QIIME-DB server is permanently down.
+
+Users are suggested to migrate their queries over to Qiita.
+
+Indeed, the previous link to
+[microbio.me/qiime](http://www.microbio.me/qiime/index.psp)
+now sends users to the new Qiita website.
+
+## An interface to Qiita is Planned.
+
+Stay tuned. The Qiita API needs to be released by the Qiita developers first.
+The phyloseq developers have no control over this,
+as we are not affiliated directly with the QIIME developers.
+Once there is an official Qiita API with documentation,
+an interface for phyloseq will be added.
+
+We found the `microbio_me_qiime()` function
+to be very convenient while the QIIME-DB server lasted.
+Hopefully an equivalent is hosted soon.
+
+
+# - I want a phyloseq graphic that looks like...
+
+Great!
+
+**Every plot function in phyloseq returns a ggplot2 object**.
+When these objects are "printed" to standard output in an R session,
+for instance,
+
+```{r}
+data(esophagus)
+plot_tree(esophagus)
+```
+
+then the graphic is rendered in
+[the current graphic device](https://stat.ethz.ch/R-manual/R-devel/library/grDevices/html/Devices.html).
+
+Alternatively, if you save the object output from a phyloseq `plot_` function
+as a variable in your session,
+then you can further modify it, interactively, at your leisure.
+For instance,
+
+```{r}
+p1 = plot_tree(esophagus, color = "Sample")
+p1
+p1 +
+ ggtitle("This is my title.") +
+ annotate("text", 0.25, 3,
+ color = "orange",
+ label = "my annotation")
+```
+
+There are lots of ways for you to generate custom graphics
+with phyloseq as a starting point.
+
+The following sections list some of my favorites.
+
+## Modify the ggplot object yourself.
+
+For example,
+[the plot_ordination() examples tutorial](http://joey711.github.io/phyloseq/plot_ordination-examples.html)
+provides several examples of using additional ggplot2 commands
+to modify/customize the graphic encoded in the ggplot2 object
+returned by `plot_ordination`.
+
+[The ggplot2 documentation](http://docs.ggplot2.org/current/)
+is the current and canonical online reference
+for creating, modifying, and developing with ggplot2 objects.
+
+For simple changes to aesthetics and aesthetic mapping,
+[the aesthetic specifications vignette](http://docs.ggplot2.org/current/vignettes/ggplot2-specs.html)
+is a useful resource.
+
+
+## psmelt and ggplot2
+
+The `psmelt` function converts your phyloseq object
+into a table (`data.frame`)
+that is very friendly for defining a custom ggplot2 graphic.
+This function was originally created
+as an internal (not user-exposed) tool
+within phyloseq to enable
+a [DRY](https://en.wikipedia.org/wiki/Don%27t_repeat_yourself)
+approach to building ggplot2 graphics
+from microbiome data represented as phyloseq objects.
+
+When applicable, the phyloseq `plot_` family of functions
+use `psmelt`.
+This function is now a documented
+and user-accessible function in phyloseq --
+for the main purpose of enabling users
+to create their own ggplot2 graphics as needed.
+
+There are lots of great documentation examples for ggplot2 at
+
+- [the ggplot2 official documentation site](http://docs.ggplot2.org/current/),
+- [ggplot2 on StackOverflow](http://stackoverflow.com/tags/ggplot2), and
+- [phyloseq documentation pages](https://joey711.github.io/phyloseq/).
+
+The following are two very simple examples of using
+`psmelt` to define your own ggplot2 object "from scratch".
+It should be evident that you could include further ggplot2 commands
+to modify each plot further, as you see fit.
+
+```{r}
+data("esophagus")
+mdf = psmelt(esophagus)
+# Simple bar plot. See plot_bar() for more.
+ggplot(mdf, aes(x = Sample,
+ y = Abundance)) +
+ geom_bar(stat = "identity", position = "stack", color = "black")
+# Simple heat map. See plot_heatmap() for more.
+ggplot(mdf, aes(x = Sample,
+ y = OTU,
+ fill = Abundance)) +
+ geom_raster()
+```
+
+## Submit a Pull Request (Advanced)
+
+If your new custom plot function is awesome and you think others might use it,
+add it to the `"plot-methods.R"` source file
+and submit a pull request on GitHub.
+
+[GitHub Official Pull Request Documentation](https://help.github.com/articles/using-pull-requests/)
+
+Please include example and test code
+in the code included in your pull request.
+
+I'll try and add it to the package by the next release.
+I will also give you authorship credit in the function doc.
+See the "typo fix" section below for further details about GitHub pull requests...
+
+## Define a ggplot2 extension (Advanced)
+
+Development of new R functions/commands
+for creating/modifying new geometric objects
+is now formally documented in
+[the ggplot2 extension vignette](http://docs.ggplot2.org/current/vignettes/extending-ggplot2.html).
+
+This may be related to the previous section,
+in that your ggplot2 extension for phyloseq
+could be contributed to the phyloseq project as a pull request.
+
+
+# - There’s a typo in phyloseq documentation, tutorials, or vignettes
+
+This is something that is actually faster and less work
+for you to solve yourself
+and contribute back to the phyloseq package.
+For trivial typo fixes,
+I will quickly include your fixes into the package code.
+Sometimes I accept them on my cell phone
+while I'm still in bed.
+No wasted time on either end! :-)
+
+The point is that this should be simple,
+and is simple if you follow one of the following suggestions.
+
+## Fix the typo directly on GitHub
+
+GitHub now provides the option to make changes
+to code/text of a repository
+directly from your web browser through an in-page editor.
+This handles all the Git details for you.
+If you have a GitHub account and you're logged in,
+all you'd have to do is locate the file with the offending typo,
+then use the "edit" button to
+make the changes and
+send them to me as a pull request.
+
+## Minimal GIT and GitHub Exercise
+
+![](http://i.imgur.com/j9NYXiQ.png)
+
+(The following instructions are borrowed
+from [Yihui Xie's site about fixing typos](http://yihui.name/en/2013/06/fix-typo-in-documentation/))
+
+Alternatively, for those who want to try GIT and Github pull requests,
+which make it possible for you to contribute to open source
+and fix obvious problems with no questions being asked --
+just do it yourself, and send the changes to the original author(s) through Github.
+
+The official documentation for Github pull requests
+is a little bit verbose for beginners.
+Basically what you need to do for simple tasks are:
+
+1. click the Fork button and clone the repository in your own account;
+2. make the changes in your cloned version;
+3. push to your repository;
+4. click the Pull Request button to send a request to the original author;
+
+
+
+# - I read ["Waste Not, Want Not..."](http://journals.plos.org/ploscompbiol/article?id=10.1371/journal.pcbi.1003531) but...
+
+Before getting to more specific issues,
+let's start by keeping appropriately separate the concept of
+
+- (1) denoising amplicon sequences, and/or denoising features in the contingency table, and
+- (2) standardization
+
+These two concepts have often been conflated --
+mostly by purveyors of methods that use rarefying --
+wrongly insisting that rarefying is somehow addressing both problems
+and the matter is settled.
+Unfortunately rarefying is a very inefficient, noise-introducing method
+that poorly addresses the data analysis challenges that motivate either concept.
+
+DESeq2 and related solutions can help you address
+the need for standardization (e.g. differing library sizes)
+at a particular step in your analysis
+while still making efficient inferences from your data.
+
+The denoising problem is best addressed at the sequence-processing level,
+and the best general-purpose options currently available are:
+
+- [The dada2 algorithm](http://benjjneb.github.io/dada2/), if your data works well with it. Current support is mainly Illumina sequence data, or
+- [UPARSE](http://drive5.com/uparse/) in the usearch package, if you don't have sequencing data that works well with [dada2](http://benjjneb.github.io/dada2/)
+
+
+## I tried to [use DESeq2](http://joey711.github.io/phyloseq-extensions/DESeq2.html) to normalize my data, but now I don't know what to do...
+
+The answer to a question of this category depends a lot on your experiment,
+and what you want to learn from your data.
+The following are some resources that may help.
+
+- [Waste Not, Want Not Supplemental Materials](http://joey711.github.io/waste-not-supplemental/)
+- [Differential Abundance Vignette](https://www.bioconductor.org/packages/release/bioc/vignettes/phyloseq/inst/doc/phyloseq-mixture-models.html)
+- [The phyloseq front page](https://joey711.github.io/phyloseq/)
+
+
+## My libraries/samples had different total number of reads, what do I do?
+
+That is an expected artifact of current sequencing technologies,
+and not a "problem" on its own.
+In most cases, differences in total counts are uncorrelated
+with any variable in your experimental design.
+**You should check that this is the case**.
+It remains possible that there are structural/procedural artifacts
+in your experiment that have influenced the total counts.
+If library sizes are correlated with one of your design variables,
+then this *might* represent an artifact that you need to address more carefully.
+This is a decision that you will have to make and defend.
+No software package or workflow can address this for you,
+but phyloseq/R can certainly help you check for correlation.
+See the `sample_sums()` and `sample_data()` accessor functions.
+
+Other than the portent of structural biases in your experiment,
+you should recall that
+comparisons between observation classes that have
+**uneven sample sizes is not a new nor unsolved problem in statistics**.
+
+The most useful analytical methods you can use in this context
+are therefore methods that expect and account
+for differences in total number of reads between samples.
+
+How you account for these *library size* differences
+should depend on the type of analysis in which you are engaged,
+and which methods you plan to use.
+For instance, for a beta-diversity measure like
+Bray-Curtis Dissimilarity,
+you might simply use the relative abundance of each taxon in each sample,
+as the absolute counts are not appropriate to use directly
+in the context where count differences are not meaningful.
+
+For further information, see
+
+- ["Waste Not, Want Not..."](http://journals.plos.org/ploscompbiol/article?id=10.1371/journal.pcbi.1003531)
+- [Discussion for Issue 229](https://github.com/joey711/phyloseq/issues/229)
+- [Discussion for Issue 299](https://github.com/joey711/phyloseq/issues/299)
+
+## Should I normalize my data before alpha-diversity analysis?
+
+**No.** Generally speaking, the answer is **no**.
+Most alpha diversity methods will be most effective
+when provided with the originally-observed count values.
+
+The misleading notion --
+that normalization is necessary
+prior to alpha-diversity analysis --
+seems to be derived from various
+"one size fits all" pipeline tools like QIIME,
+in which it is often encouraged to
+[*rarefy*](http://journals.plos.org/ploscompbiol/article?id=10.1371/journal.pcbi.1003531)
+counts as a normalizing transformation prior to any/all analysis.
+While this may simplify certain aspects of pipeline software development,
+it is analytical and statistical folly.
+**Rarefying microbiome data is statistically inadmissible**.
+
+For further information, I suggest reviewing literature such as
+
+- [Gotelli Colwell (2001)](http://onlinelibrary.wiley.com/doi/10.1046/j.1461-0248.2001.00230.x/abstract;jsessionid=A5EF264ABB5EADD5CCE9EF3AEE50CA41.f01t03), and of course,
+- ["Waste Not, Want Not..."](http://journals.plos.org/ploscompbiol/article?id=10.1371/journal.pcbi.1003531)
+
+
+## Negative numbers in my transformed data table?
+
+This sort of question usually appears after someone used
+a log-like transformation / variance stabilizing transformation
+on their data,
+in preparation for an exploratory analysis via ordination.
+Negative values in this context probably correspond
+to **"less than one count"** after rescaling.
+For many ordination methods,
+like [PCA](https://en.wikipedia.org/wiki/Principal_component_analysis),
+negative numbers are not a problem.
+
+Instead, the problem is often posed because a user
+also wants to use **a particular distance measure**
+that is undefined or unstable in the presence of negative entries.
+In this context, however, the more negative a value is,
+the more likely that it was zero, or very small,
+in the original "raw" count matrix.
+For most distances and hypotheses, these values
+are probably not very important, or even negligible.
+Given this, it is probably quite reasonable to do one of the following:
+
+(1) Set to zero all values less than zero.
+If `X` is your matrix, you can accomplish this with
+`X[X < 0.0] <- 0.0`
+(2) Add a pseudocount prior to data transformation.
+This often curbs or prevents the presence of zeroes
+in the table of transformed values.
+Some people don't like this approach for their dataset,
+and they may or may not be correct.
+It is up to you to decide for your data.
+See [Discussion on Issue 445](https://github.com/joey711/phyloseq/issues/445).
+
+Please also note that taxa entries that are all negative after transformation,
+or equivalently are very small or almost always zero,
+should probably be filtered from your data
+prior to analysis.
+There are many different reasons for this.
+
+
+## I get an error regarding geometric mean
+
+See my [SO post on alternative geometric mean functions in R](http://stackoverflow.com/a/25555105/935950).
+There are several examples for alternative calculations of geometric mean,
+and some of these might solve the problem of having an error.
+
+See also the discussion on [Issue 445](https://github.com/joey711/phyloseq/issues/445)
+regarding geometric means.
+
+Alternative library size estimators may be appropriate for your data,
+and it remains your responsibility
+to determine if any specific approach is valid.
+
+Mike Love (a developer for DESeq2), suggested the following consideration:
+
+"On the other hand, very sparse count datasets,
+with large counts for single samples per row and the rest at 0,
+don't fit well to the negative binomial distribution.
+Here, the VST or simply shifted log, `log(count+k)`,
+might be a safer choice than the `rlog`.
+A way that I test for sparsity is looking at a plot
+of the row sum of counts and the proportion of count
+which is in a single sample."
+
+
+## Pseudocounts are not appropriate for my data, because...
+
+See [Discussion on Issue 445](https://github.com/joey711/phyloseq/issues/445).
+
+Also, think carefully about what you mean here.
+I suspect this statement could be more accurately stated as,
+*pseudocounts are not appropriate for my experiment, data, and the analysis step I was about to perform*.
+Your position in this case is thus based on a combination
+of how the data appears to behave,
+and your knowledge of how pseudocounts would affect
+the analysis you were going to use.
+Consider the following.
+
+- Is there an alternative analysis method?
+- Is the method you were about to use really that sensitive to adding a pseudocount?
+- Is a pseudocount really needed, or were you copying/pasting this step
+to an analysis script that you found somewhere?
+
+
+## I’m scared that the Negative Binomial doesn’t fit my data well
+
+See [Discussion on Issue 445](https://github.com/joey711/phyloseq/issues/445).
+
+
+## I don’t know how to test for differential abundance now. How do I do that?
+
+There is now lots of documentation on this topic.
+
+For starters, please see
+[the phyloseq vignette devoted to this topic](http://bioconductor.org/packages/release/bioc/vignettes/phyloseq/inst/doc/phyloseq-mixture-models.html).
+
+A Google search for "phyloseq differential abundance"
+will also likely turn up a number of useful, related resources.
+
+
+# - I need help analyzing my data. It has the following study design...
+
+I am currently a biostatistician at Second Genome, Inc.,
+which offers complete
+[end-to-end microbiome experiment solutions](http://www.secondgenome.com/solutions)
+as a fee-for-service.
+In some cases Second Genome clients already have their microbiome data
+and want to make use of our team of trained microbiome analysts
+to get the most information from their experiment.
+I recommend contacting one of the sales associates at the link above.
+
+My day-to-day efforts are in understanding the role of the microbiome
+in human health and disease.
+If you're looking for a collaboration on your microbiome
+data collection or data analysis,
+please contact [Second Genome Solutions](http://www.secondgenome.com/solutions).
diff --git a/vignettes/phyloseq-analysis.Rmd b/vignettes/phyloseq-analysis.Rmd
new file mode 100644
index 0000000..345cf08
--- /dev/null
+++ b/vignettes/phyloseq-analysis.Rmd
@@ -0,0 +1,506 @@
+---
+title: "Vignette for phyloseq: Analysis of high-throughput microbiome census data"
+output:
+ BiocStyle::html_document:
+ fig_height: 7
+ fig_width: 10
+ toc: yes
+ toc_depth: 2
+ number_sections: true
+---
+<!--
+%% \VignetteEngine{knitr::rmarkdown}
+%% \VignetteIndexEntry{analysis vignette}
+-->
+
+`r library("knitr")`
+`r opts_chunk$set(cache=FALSE, fig.width=9, message=FALSE, warning=FALSE)`
+
+Paul J. McMurdie and Susan Holmes
+
+<mcmurdie at stanford.edu>
+
+[phyloseq Home Page](http://joey711.github.io/phyloseq/)
+
+If you find phyloseq and/or its tutorials useful, please acknowledge and cite phyloseq in your publications:
+
+**phyloseq: An R package for reproducible interactive analysis and graphics of microbiome census data** (2013) PLoS ONE 8(4):e61217
+http://dx.plos.org/10.1371/journal.pone.0061217
+
+# Other resources
+The phyloseq project also has a number of supporting online resources, most of which can be found at [the phyloseq home page](http://joey711.github.com/phyloseq/), or from the phyloseq stable release [page on Bioconductor](http://bioconductor.org/packages/release/bioc/html/phyloseq.html).
+
+To post feature requests or ask for help, try [the phyloseq Issue Tracker](https://github.com/joey711/phyloseq/issues).
+
+
+# Summary
+
+The analysis of microbiological communities brings many challenges: the integration of many different types of data with methods from ecology, genetics, phylogenetics, network analysis, visualization and testing. The data itself may originate from widely different sources, such as the microbiomes of humans, soils, surface and ocean waters, wastewater treatment plants, industrial facilities, and so on; and as a result, these varied sample types may have very different forms and scales of [...]
+
+
+# About this vignette
+
+A separate vignette is included within the phyloseq-package that describes the basics of importing pre-clustered phylogenetic sequencing data, data filtering, as well as some transformations and some additional details about the package and installation. A quick way to load it is:
+
+```{r dontrun-basics-vignette, eval=FALSE}
+vignette("phyloseq-basics")
+```
+
+By contrast, this vignette is intended to provide functional examples of the analysis tools and wrappers included in phyloseq. All necessary code for performing the analysis and producing graphics will be included with its description, and the focus will be on the use of example data that is included and documented within the phyloseq-package.
+
+Let's start by loading the `phyloseq` package:
+
+```{r load-packages, message=FALSE, warning=FALSE}
+library("phyloseq")
+library("ggplot2")
+```
+
+And because we will show examples of custom modifications to ggplot2 plots,
+we also loaded ggplot2.
+Here I'll set as default my favorite ggplot2 theme.
+These are completely optional, and modifiable.
+
+```{r ggplot2-themes}
+theme_set(theme_bw())
+```
+
+
+
+# Data
+
+## Interface with the microbio.me/qiime server
+See the [microbio_me_qiime tutorial](http://joey711.github.io/phyloseq/download-microbio.me.html) for more details and examples downloading and importing into phyloseq/R directly from this public database.
+
+## Included Data
+To facilitate testing and exploration of tools in phyloseq, this package includes example data from published studies. Many of the examples in this vignette use either the [Global Patterns](http://www.pnas.org/content/early/2010/06/02/1000080107) or `enterotype` datasets as source data. The [Global Patterns](http://www.pnas.org/content/early/2010/06/02/1000080107) data was described in a [2011 article in PNAS](http://www.pnas.org/content/early/2010/06/02/1000080107)([Caporaso 2011](http: [...]
+
+Because this data is included in the package, the examples can easily be run on your own computer using the code shown in this vignette. The data is loaded into memory using the `data` command. Let's start by loading the [Global Patterns](http://www.pnas.org/content/early/2010/06/02/1000080107) data.
+
+```{r}
+data(GlobalPatterns)
+```
+
+Later on we will use an additional categorical designation --- human versus non-human associated samples --- that was not in the original dataset. Now is a good time to add it as an explicit variable of the `sample_data`, and because we don't want to type long words over and over, we'll choose a shorter name for this modified version of [Global Patterns](http://www.pnas.org/content/early/2010/06/02/1000080107), call it `GP`, and also remove a handful of taxa that are not present in any o [...]
+
+```{r }
+# prune OTUs that are not present in at least one sample
+GP <- prune_taxa(taxa_sums(GlobalPatterns) > 0, GlobalPatterns)
+# Define a human-associated versus non-human categorical variable:
+human <- get_variable(GP, "SampleType") %in% c("Feces", "Mock", "Skin", "Tongue")
+# Add new human variable to sample data:
+sample_data(GP)$human <- factor(human)
+```
+
+
+# Simple exploratory graphics
+
+## Easy Richness Estimates
+
+For further details, see the [plot_richness tutorial](http://joey711.github.io/phyloseq/plot_richness-examples.html)
+
+We can easily create a complex graphic that compares the richness estimates of samples from different environment types in the [Global Patterns](http://www.pnas.org/content/early/2010/06/02/1000080107) dataset, using the `plot_richness` function. Note that it is important to use raw (untrimmed) OTU-clustered data when performing richness estimates, as they can be highly dependent on the number of singletons in a sample.
+
+```{r richness_estimates0, fig.width=13, fig.height=7}
+alpha_meas = c("Observed", "Chao1", "ACE", "Shannon", "Simpson", "InvSimpson")
+(p <- plot_richness(GP, "human", "SampleType", measures=alpha_meas))
+```
+
+Add a ggplot2 box plot layer to the previous plot
+
+```{r richness_estimates, fig.width=13, fig.height=7}
+p + geom_boxplot(data=p$data, aes(x=human, y=value, color=NULL), alpha=0.1)
+```
+Alpha diversity estimators for samples in the *Global Patterns* dataset. Each panel shows a different type of estimator. Individual color-shaded points and brackets represent the richness estimate and the theoretical standard error range associated with that estimate, respectively. The colors group the sample-sources into "types". Within each panel, the samples are further organized into human-associated (`TRUE`) or not (`FALSE`), and a boxplot is overlayed on top of this for the two gro [...]
+
+
+## Exploratory tree plots
+
+For further details, see the [plot_tree tutorial](http://joey711.github.io/phyloseq/plot_tree-examples.html)
+
+phyloseq also contains a method for easily plotting an annotated phylogenetic tree with information regarding the sample in which a particular taxa was observed, and optionally the number of individuals that were observed.
+
+For the sake of creating a readable tree, let's subset the data to just the [Chlamydiae](http://en.wikipedia.org/wiki/Chlamydiae) phylum, which consists of obligate intracellular pathogens and is present in only a subset of environments in this dataset.
+
+```{r }
+GP.chl <- subset_taxa(GP, Phylum=="Chlamydiae")
+```
+
+And now we will create the tree graphic from this subset of [Global Patterns](http://www.pnas.org/content/early/2010/06/02/1000080107), shading by the `SampleType` variable, which indicates the environment category from which the microbiome samples originated. The following command also takes the option of labeling the number of individuals observed in each sample (if at all) of each taxa. The symbols are slightly enlarged as the number of individuals increases.
+
+```{r GP-chl-tree, fig.width=15, fig.height=7, message=FALSE, warning=FALSE}
+plot_tree(GP.chl, color="SampleType", shape="Family", label.tips="Genus", size="Abundance")
+```
+Phylogenetic tree representation of the Chlamydiae species in the microbiome samples of the "Global Patterns" dataset([Caporaso 2011](http://www.pnas.org/content/early/2010/06/02/1000080107)).
+
+
+## Exploratory bar plots
+
+For further details, see the [plot_bar tutorial](http://joey711.github.io/phyloseq/plot_bar-examples.html)
+
+In the following example we use the included "enterotype" dataset ([Arumugam 2011](http://www.nature.com/nature/journal/v473/n7346/full/nature09944.html)).
+
+```{r}
+data(enterotype)
+```
+We start with a simple rank-abundance barplot, using the cumulative fractional abundance of each OTU in the dataset. In the enterotype dataset, the available published data are simplified as sample-wise fractional occurrences, rather than counts of individuals\footnote{Unfortunate, as this means we lose information about the total number of reads and associated confidences, ability to do more sophisticated richness estimates, etc. For example, knowing that we observed 1 sequence read of [...]
+
+```{r EntAbundPlot, fig.height=6, fig.width=8}
+par(mar = c(10, 4, 4, 2) + 0.1) # make more room on bottom margin
+N <- 30
+barplot(sort(taxa_sums(enterotype), TRUE)[1:N]/nsamples(enterotype), las=2)
+```
+An example exploratory barplot using base `R` graphics and the `taxa_sums` and `nsamples` functions.
+
+Note that this first barplot is clipped at the `r N`th OTU. This was chosen because `ntaxa(enterotype)` = `r ntaxa(enterotype)` OTUs would not be legible on the plot. As you can see, the relative abundances have decreased dramatically by the 10th-ranked OTU.
+
+So what are these OTUs? In the `enterotype` dataset, only a single taxonomic rank type is present:
+```{r}
+rank_names(enterotype)
+```
+This means the OTUs in this dataset have been grouped at the level of genera, and no other taxonomic grouping/transformation is possible without additional information (like might be present in a phylogenetic tree, or with further taxonomic classification analysis).
+
+We need to know which taxonomic rank classifiers, if any, we have available to specify in the second barplot function in this example, `plot_bar()`. We have already observed how quickly the abundance decreases with rank, so we will subset the enterotype dataset to the most abundant `N` taxa in order to make the barplot legible on this page.
+
+```{r}
+TopNOTUs <- names(sort(taxa_sums(enterotype), TRUE)[1:10])
+ent10 <- prune_taxa(TopNOTUs, enterotype)
+print(ent10)
+```
+
+Note also that there are `r nsamples(ent10)` samples in this dataset, and so a remaining challenge is to consolidate these samples into meaningful groups. A good place to look is the available sample variables, which in most cases will carry more "meaning" than the sample names alone.
+
+```{r}
+sample_variables(ent10)
+```
+
+The parameters to `plot_bar` in the following code-chunk were chosen after various trials. We suggest that you also try different parameter settings while you're exploring different features of the data. In addition to the variables names of `sample_data`, the `plot_bar()` function recognizes the names of taxonomic ranks (if present). See the help documentation and further details in the examples and on the wiki page. In this example we have also elected to organize data by "facets" (sep [...]
+
+```{r entbarplot0, fig.height=6, fig.width=10}
+plot_bar(ent10, "SeqTech", fill="Enterotype", facet_grid=~Genus)
+```
+
+An example exploratory bar plot using the `plot_bar` function. In this case we have faceted the data (abundance values) according to the genera of each OTU. The subset of OTUs that have not been assigned to a specific genus are in the `NA` panel. Within each facet, the data is further separated by sequencing technology, and each OTU is shaded according to the enterotype of the sample from which it came. Abundance values from different samples and OTUs but having the same variables ma [...]
+
+Figure summarizes quantitatively the increased abundances of Bacteroides and Prevotella in the Enterotypes 1 and 2, respectively. Interestingly, a large relative abundance of Blautia was observed for Enterotype 3, but only from 454-pyrosequencing data sets, not the Illumina or Sanger datasets. This suggests the increased Blautia might actually be an artifact. Similarly, Prevotella appears to be one of the most abundant genera in the Illumina-sequenced samples among Enterotype 3, but this [...]
+
+
+
+
+# Exploratory analysis and graphics
+
+## Exploratory Heat Map
+
+For further details, see the [plot_heatmap tutorial](http://joey711.github.io/phyloseq/plot_heatmap-examples.html)
+
+As the number of taxa in a dataset gets very large, the ability to effectively display all of the elements of the data becomes compromised, and a heatmap representation is no exception. It can also be time-consuming to render. To address both these issues, we show an example in which we have subsetted the Global Patterns dataset to a manageable portion, in this case, the Crenarchaeota phylum.
+
+```{r GPheatmap}
+data("GlobalPatterns")
+gpac <- subset_taxa(GlobalPatterns, Phylum=="Crenarchaeota")
+(p <- plot_heatmap(gpac, "NMDS", "bray", "SampleType", "Family"))
+```
+
+What if you wanted to change the axis labels?
+
+```{r GPheatmap-rename-axes}
+p$scales$scales[[1]]$name <- "My X-Axis"
+p$scales$scales[[2]]$name <- "My Y-Axis"
+print(p)
+```
+
+Note that it is possible to order the sample/species indices
+by any of the ordination methods supported in the `ordinate` function;
+and also that the color scheme can be modified with additional arguments.
+
+
+Heat map representation of the Crenarchaeota phylum abundance pattern across different sample types in the Global Patterns dataset.
+
+
+## Microbiome Network Representation
+
+For further details, see the [plot_network tutorial](http://joey711.github.io/phyloseq/plot_network-examples.html)
+
+Continuing with the `enterotype` dataset, here are some examples for creating a custom network representation of the relationship between microbiome samples in an experiment. This relies heavily on the igraph and ggplot2 packages to create a network display of the "connectedness" of samples according to some user-provided ecological similarity. By default, points represent microbiome samples, and are determined using an algorithm that optimizes the clarity of the display of network "edges [...]
+
+In this example, the default dissimilarity index was used (Jaccard, co-occurrence), with a maximum distance of `0.3` required to create an edge. Any function that can operate on phyloseq-objects and return a sample-wise distance can be provided as the `dist.fun` argument, or a character string of the name of the distance function already supported in phyloseq. Other distances may result in very different clustering, and this is a choice that should be understood and not taken too lightly [...]
+
+Interestingly, at this level of analysis and parameter-settings the two major sub-graphs appear to be best explained by the sequencing technology and not the subject enterotype, suggesting that the choice of sequencing technology has a major effect on the microbial community one can observe. This seems to differ somewhat with the inferences described in the "enterotype" article ([Arumugam 2011](http://www.nature.com/nature/journal/v473/n7346/full/nature09944.html)). However, there could [...]
+
+```{r plot_sample_network, fig.width=11, fig.height=7, message=FALSE, warning=FALSE}
+data(enterotype)
+plot_net(enterotype, maxdist=0.4, color="SeqTech", shape="Enterotype")
+```
+
+Network representation of the relationship between microbiome samples in the "Enterotype" dataset ([Arumugam 2011](http://www.nature.com/nature/journal/v473/n7346/full/nature09944.html)).
+
+
+
+
+## Ordination Methods
+
+For further details, see the [plot_ordination tutorial](http://joey711.github.io/phyloseq/plot_ordination-examples.html)
+
+Ordination methods can be a useful tool for exploring complex phylogenetic sequencing data, particularly when the hypothesized structure of the data is poorly defined (or there isn't a hypothesis). The phyloseq package provides some useful tools for performing ordinations and plotting their results, via the `ordinate()` and `plot_ordination()` functions, respectively. Although there are many options and methods supported, a first-step will probably look something like the following:
+
+```{r eval=FALSE}
+my.physeq <- import("Biom", BIOMfilename="myBiomFile.biom")
+my.ord <- ordinate(my.physeq)
+plot_ordination(my.physeq, my.ord, color="myFavoriteVarible")
+```
+
+It is probably a good idea to read the documentation for these two functions, as they also provide links to related functions and additional examples you can try immediately on your own machine.
+```{r help-import, eval=FALSE}
+help(import)
+help(ordinate)
+help(distance)
+help(plot_ordination)
+```
+
+
+### Principal Coordinates Analysis (PCoA)
+
+We take as our first example, a reproduction of Figure 5 from the "Global Patterns" article([Caporaso 2011](http://www.pnas.org/content/early/2010/06/02/1000080107)). The authors show a 3-dimensional representation of the first three axes of a Principal Coordinates Analysis (PCoA; This is also sometimes referred to as "Multi-Dimensional Scaling", or "MDS") performed on the unweighted-UniFrac distance using all of the available sequences (their approach included both 5' and 3' sequences). [...]
+
+The following reproduces the unweighted UniFrac distance calculation on the full dataset. Note that this calculation can take a long time because of the large number of OTUs. Parallelization is recommended for large datasets, typically if they are as large as [Global Patterns](http://www.pnas.org/content/early/2010/06/02/1000080107), or larger. For details on parallelization, see the details section and examples in the `UniFrac()` documentation, and also the page dedicated to the topic o [...]
+
+http://joey711.github.io/phyloseq-demo/unifrac.html
+
+```{r GP-data-load}
+data(GlobalPatterns)
+```
+```{r, eval=FALSE}
+GPUF <- UniFrac(GlobalPatterns)
+```
+
+Load the pre-computed distance matrix, `GPUF`
+
+```{r load-precomputed-UF}
+load(system.file("doc", "Unweighted_UniFrac.RData", package="phyloseq"))
+```
+
+Calculate the PCoA on this distance matrix, `GPUF`.
+
+```{r}
+GloPa.pcoa = ordinate(GlobalPatterns, method="PCoA", distance=GPUF)
+```
+
+Before we look at the results, let's first investigate how much of the total distance structure we will capture in the first few axes. We can do this graphically with a "scree plot", an ordered barplot of the relative fraction of the total eigenvalues associated with each axis.
+
+```{r PCoAScree, fig.width=6, fig.height=4}
+plot_scree(GloPa.pcoa, "Scree plot for Global Patterns, UniFrac/PCoA")
+```
+
+Scree plot of the PCoA used to create Figure 5 from the "Global Patterns" article([Caporaso 2011](http://www.pnas.org/content/early/2010/06/02/1000080107)). The first three axes represent `r round(100*sum(GloPa.pcoa$values$Relative_eig[1:3]))`% of the total variation in the distances. Interestingly, the fourth axis represents another `r round(100*(GloPa.pcoa$values$Relative_eig[4]))`%, and so may warrant exploration as well. A scree plot is an important tool for any ordination method, as [...]
+
+Next, we will reproduce Figure 5 from the "Global Patterns" article([Caporaso 2011](http://www.pnas.org/content/early/2010/06/02/1000080107)), but separating the three axes into 2 plots using `plot_ordination()`.
+
+```{r GPfig5ax1213}
+(p12 <- plot_ordination(GlobalPatterns, GloPa.pcoa, "samples", color="SampleType") +
+ geom_point(size=5) + geom_path() + scale_colour_hue(guide = FALSE) )
+(p13 <- plot_ordination(GlobalPatterns, GloPa.pcoa, "samples", axes=c(1, 3),
+ color="SampleType") + geom_line() + geom_point(size=5) )
+```
+A reproduction in phyloseq / R of the main panel of Figure 5 from the "Global Patterns" article([Caporaso 2011](http://www.pnas.org/content/early/2010/06/02/1000080107)), on two plots. The horizontal axis represents the first axis in the PCoA ordination, while the top and bottom vertical axes represent the second and third axes, respectively. Different points represent different samples within the dataset, and are shaded according to the environment category to which they belong. The col [...]
+
+
+### non-metric Multi-Dimensional Scaling (NMDS)
+We repeat the previous example, but instead using non-metric multidimensional scaling (NMDS) limited to just two dimensions. This approach limits the amount of residual distance "not shown" in the first two (or three) axes, but forfeits some mathematical properties and does not always converge within the specified number of axes.
+
+```{r GP_UF_NMDS0}
+# (Re)load UniFrac distance matrix and GlobalPatterns data
+data(GlobalPatterns)
+load(system.file("doc", "Unweighted_UniFrac.RData", package="phyloseq"))
+# perform NMDS, set to 2 axes
+GP.NMDS <- ordinate(GlobalPatterns, "NMDS", GPUF)
+(p <- plot_ordination(GlobalPatterns, GP.NMDS, "samples", color="SampleType") +
+ geom_line() + geom_point(size=5) )
+```
+An example exploratory ordination using non-metric multidimensional scaling (NMDS) on the unweighted UniFrac distance between samples of the "Global Patterns" dataset. Sample points are shaded by environment type, and connected by a line if they belong to the same type. Compare with Figure 5 from the "Global Patterns" article([Caporaso 2011](http://www.pnas.org/content/early/2010/06/02/1000080107)).
+
+The figure nicely shows the relative dissimilarities between microbial communities from different habitats. However, it fails to indicate what was different between the communities. For an ordination method that provides information on the taxa that explain differences between samples (or groups of samples), we use Correspondence Analysis.
+
+### Correspondence Analysis (CA)
+
+In the following section we will continue our exploration of the "GlobalPatterns" dataset using various features of an ordination method called Correspondence Analysis. We give special emphasis to exploratory interpretations using the biplot, because it provides additional information that is not available from PCoA or NMDS.
+
+Let's start by performing a Correspondence Analysis and investigating the scree plot. Both interestingly and challengingly, the scree plot suggests that the [Global Patterns](http://www.pnas.org/content/early/2010/06/02/1000080107) abundance data is quite high-dimensional, with the first two CA axes accounting for not quite 17% of the total (chi-square) variability. Note the absence of a steep decline in eigenvalue fraction as axis number increases. Each additional axis represents only m [...]
+
+First, let's severely subset the number of species for the sake of run-time. (This is for illustration purposes only; do not repeat unless you are very sure you have a good reason for doing this.)
+
+```{r GPCAscree0, fig=FALSE}
+data(GlobalPatterns)
+# Take a subset of the GP dataset, top 200 species
+topsp <- names(sort(taxa_sums(GlobalPatterns), TRUE)[1:200])
+GP <- prune_taxa(topsp, GlobalPatterns)
+# Subset further to top 5 phyla, among the top 200 OTUs.
+top5ph <- sort(tapply(taxa_sums(GP), tax_table(GP)[, "Phylum"], sum), decreasing=TRUE)[1:5]
+GP <- subset_taxa(GP, Phylum %in% names(top5ph))
+# Re-add human variable to sample data:
+sample_data(GP)$human <- factor(human)
+```
+
+Now perform the correspondence analysis.
+
+```{r GPCAscree, fig.width=8, fig.height=5}
+# Now perform a unconstrained correspondence analysis
+gpca <- ordinate(GP, "CCA")
+# Scree plot
+plot_scree(gpca, "Scree Plot for Global Patterns Correspondence Analysis")
+```
+The correspondence analysis (CA) scree plot of the "Global Patterns" dataset.
+
+Now let's investigate how the samples behave on the first few CA axes.
+
+```{r GPCA1234}
+(p12 <- plot_ordination(GP, gpca, "samples", color="SampleType") +
+ geom_line() + geom_point(size=5) )
+(p34 <- plot_ordination(GP, gpca, "samples", axes=c(3, 4), color="SampleType") +
+ geom_line() + geom_point(size=5) )
+```
+First 4 axes of Correspondence Analysis (CA) of the "Global Patterns" dataset ([Caporaso 2011](http://www.pnas.org/content/early/2010/06/02/1000080107)).
+
+A clear feature of these plots is that the feces and mock communities cluster tightly together, far away from all other samples on the first axis (CA1). The skin and tongue samples separate similarly, but on the second axis. Taken together, it appears that the first two axes are best explained by the separation of human-associated "environments" from the other non-human environments in the dataset, with a secondary separation of tongue and skin samples from feces.
+
+We will now investigate further this top-level structure of the data, using an additional feature of correspondence analysis that allows us to compare the relative contributions of individual taxa on the same graphical space: the "biplot". However, because we just displayed the position of samples in the ordination and there are often many thousands of OTUs, we will focus on creating an interpretable plot of the OTUs. For creating graphics that combine the two plots, try the `"biplot"` o [...]
+
+```{r GPCAspecplot0}
+p1 <- plot_ordination(GP, gpca, "species", color="Phylum")
+(p1 <- ggplot(p1$data, p1$mapping) + geom_point(size=5, alpha=0.5) +
+ facet_wrap(~Phylum) + scale_colour_hue(guide = FALSE) )
+```
+
+Species plot of the "Global Patterns" correspondence analysis first two axes, with each phylum on a different panel ("facet"). Only the most abundant 5 phyla among the most abundant 200 taxa (cumulative, all samples) are included. Arbitrary reduction, for computational efficiency of example.
+
+Let's try drawing the figure again, only this time summarizing the species points as a 2D density estimate, without any individual points.
+
+```{r GPCAspecplotTopo0}
+(p3 <- ggplot(p1$data, p1$mapping) + geom_density2d() +
+ facet_wrap(~Phylum) + scale_colour_hue(guide = FALSE) )
+```
+Redrawn figure, which is severely overplotted, as a 2-dimensional species-density topographic map, faceted in the same way.
+
+These figures reveal some useful patterns and interesting outliers, but what if we want a complete summary of how each phylum is represented along each axis? The following code is a way to show this using boxplots, while still avoiding the occlusion problem (points layered on top of each other), and also conveying some useful information about the pattern of taxa that contribute to the separation of human-associated samples from the other sample types. It re-uses the data that was stored [...]
+
+```{r GPCAjitter0}
+library("reshape2")
+# Melt the species-data.frame, DF, to facet each CA axis separately
+mdf <- melt(p1$data[, c("CA1", "CA2", "Phylum", "Family", "Genus")],
+ id=c("Phylum", "Family", "Genus") )
+# Select some special outliers for labelling
+LF <- subset(mdf, variable=="CA2" & value < -1.0)
+# build plot: boxplot summaries of each CA-axis, with labels
+p <- ggplot(mdf, aes(Phylum, value, color=Phylum)) +
+ geom_boxplot() +
+ facet_wrap(~variable, 2) +
+ scale_colour_hue(guide = FALSE) +
+ theme_bw() +
+ theme( axis.text.x = element_text(angle = -90, vjust = 0.5) )
+# Add the text label layer, and render ggplot graphic
+(p <- p + geom_text(data=subset(LF, !is.na(Family)),
+ mapping = aes(Phylum, value+0.1, color=Phylum, label=Family),
+ vjust=0,
+ size=2))
+```
+Boxplot of taxa (species in this case) of the "Global Patterns" CA first two axes, shaded/separated by phylum. Through this approach it is much easier to see particular species that cluster unusually relative to the rest of their phylum, for example the Bacteroidetes species (Prevotellaceae family) that is positioned most in the negative CA2 direction toward the Tongue/Skin samples.
+
+One way to relate some of the high-level patterns we observed from correspondence analysis is to directly visualize the abundances in an organized, quantitative way, to see if this does in fact support / explain the human/environment microbiome differences. Here is an example using the `plot_bar` function described in an earlier section.
+
+```{r GPtaxaplot0}
+plot_bar(GP, x="human", fill="SampleType", facet_grid= ~ Phylum)
+```
+Phylum-level comparison of relative abundance of taxa in samples that are from human microbiomes (or not).
+
+In this figure we've used the `threshold` parameter to omit all but phyla accounting for the top 90% of phyla in any one sample. Some patterns emerging from this display appear to be: (1) Cyanobacteria, Actinobacteria appear under-represented in human samples; (2) conversely, Firmicutes appear over-represented in human samples; (3) Acidobacteria, Verrucomicrobia appear over-represented in the fecal samples; (4) the only Crenarchaeota were observed in the Mock sample, which is not really [...]
+
+
+### Double Principal Coordinate Analysis (DPCoA)
+
+Here is a quick example illustrating the use of Double Principal Coordinate Analysis (DPCoA; Pavoine et al. 2004), using the `ordinate()` function in phyloseq, as well as the "biplot" option for `plot_ordination()`. For a description that includes an applied example using the "enterotype" dataset and comparison with UniFrac/PCoA, see Fukuyama et al. (2012).
+
+```{r GPdpcoa01}
+GP.dpcoa <- ordinate(GP, "DPCoA")
+pdpcoa <- plot_ordination(GP, GP.dpcoa, type="biplot",
+ color="SampleType", shape="Phylum")
+shape.fac <- pdpcoa$data[, deparse(pdpcoa$mapping$shape)]
+man.shapes <- c(19, 21:25)
+names(man.shapes) <- c("Samples", levels(shape.fac)[levels(shape.fac)!="Samples"])
+p2dpcoa <- pdpcoa + scale_shape_manual(values=man.shapes)
+```
+
+A biplot representation of a Double Principal Coordinate Analysis (DPCoA), on a simplified version of the "Global Patterns" dataset with only the most abundant 200 OTUs included.
+
+```{r GPdpcoa02}
+# Show just Samples or just Taxa
+plot_ordination(GP, GP.dpcoa, type="taxa", shape="Phylum")
+plot_ordination(GP, GP.dpcoa, type="samples", color="SampleType")
+# Split
+plot_ordination(GP, GP.dpcoa, type="split",
+ color="SampleType", shape="Phylum") +
+ ggplot2::scale_colour_discrete()
+```
+
+
+## Distance Methods
+
+### distance(): Central Distance Function
+
+Many comparisons of microbiome samples, including the graphical model and the PCoA analysis, require a calculation for the relative dissimilarity/distance between one microbial community and another. The phyloseq-package provides a general "wrapper" function for calculating ecological distance matrices between the samples in an experiment.
+
+`distance()` currently supports 43 method options, as well as user-provided arbitrary methods via an interface to vegan's `designdist()` function. Currently only sample-wise distances are supported (the `type` argument), but eventually species-wise (OTU-wise) distances will be supported as well. In addition to supporting any of the method options to the three main distance functions of the vegan-package --- including the 14 distances of the `vegdist()` function and all 24 [...]
+
+The function takes a `phyloseq-class` object and an argument indicating the distance type; and it returns a `dist`-class distance matrix.
+
+```{r distancefun}
+data(esophagus)
+distance(esophagus, "bray")
+distance(esophagus, "wunifrac") # weighted UniFrac
+distance(esophagus, "jaccard") # vegdist jaccard
+distance(esophagus, "g") # betadiver method option "g"
+```
+
+
+### UniFrac and weighted UniFrac
+UniFrac is a recently-defined (Lozupone and Knight 2005) and popular distance metric to summarize the difference between pairs of ecological communities. All UniFrac variants use a phylogenetic tree of the relationship among taxa as central information to calculating the distance between two samples/communities. An unweighted UniFrac distance matrix only considers the presence/absence of taxa, while weighted UniFrac accounts for the relative abundance of taxa as well as their phylogenetic dis [...]
+
+The following is an example calculating the UniFrac distance (both weighted and unweighted) matrix using the "esophagus" example dataset:
+
+```{r eval=FALSE, echo=TRUE}
+data(esophagus)
+distance(esophagus, "wUniFrac")
+distance(esophagus, "uUniFrac")
+```
+
+See the phyloseq demo page about fast parallel UniFrac.
+
+
+## Hierarchical Clustering
+Another potentially useful and popular way to visualize/decompose sample-distance matrices is through hierarchical clustering (e.g. `hclust`). In the following example, we reproduce Figure 4 from the ["Global Patterns" article](http://www.pnas.org/content/early/2010/06/02/1000080107), using the unweighted UniFrac distance and the UPGMA method (`hclust` parameter `method="average"`). Try `help("hclust")` for alternative clustering methods included in standard R.
+
+```{r}
+# (Re)load UniFrac distance matrix and GlobalPatterns data
+data(GlobalPatterns)
+load(system.file("doc", "Unweighted_UniFrac.RData", package="phyloseq"))
+# Manually define color-shading vector based on sample type.
+colorScale <- rainbow(length(levels(get_variable(GlobalPatterns, "SampleType"))))
+cols <- colorScale[get_variable(GlobalPatterns, "SampleType")]
+GP.tip.labels <- as(get_variable(GlobalPatterns, "SampleType"), "character")
+# This is the actual hierarchical clustering call, specifying average-link clustering
+GP.hclust <- hclust(GPUF, method="average")
+plot(GP.hclust, col=cols)
+```
+
+An alternative means of summarizing a distance matrix via hierarchical clustering and plotting as an annotated dendrogram. Compare with Figure 4 from the [Global Patterns](http://www.pnas.org/content/early/2010/06/02/1000080107) article. Some differences in this figure from the original article might be explained by [Global Patterns](http://www.pnas.org/content/early/2010/06/02/1000080107) in phyloseq being the summed observations from both primer directions (5' and 3'), while in the [...]
+
+
+# Multiple Testing and Differential Abundance
+
+One of our recommended approaches to this problem was described in
+McMurdie and Holmes (2014) [Waste Not, Want Not: Why Rarefying Microbiome Data is Inadmissible](http://dx.plos.org/10.1371/journal.pcbi.1003531).
+PLoS Computational Biology. 10(4):e1003531
+
+Some reproducible demonstrations of this approach are included in
+[the phyloseq extensions repository](http://joey711.github.io/phyloseq-extensions/extensions-index.html), the `phyloseq_to_deseq2` function,
+as well as a separate vignette dedicated to this topic
+(phyloseq and DESeq2 on Colorectal Cancer Data).
+
+Please make use of these materials for differential abundance testing.
diff --git a/vignettes/phyloseq-basics.Rmd b/vignettes/phyloseq-basics.Rmd
new file mode 100644
index 0000000..a44421e
--- /dev/null
+++ b/vignettes/phyloseq-basics.Rmd
@@ -0,0 +1,600 @@
+---
+title: "Basic storage, access, and manipulation of phylogenetic sequencing data with *phyloseq*"
+output:
+ BiocStyle::html_document:
+ fig_height: 7
+ fig_width: 10
+ toc: yes
+ toc_depth: 2
+ number_sections: true
+---
+<!--
+%% \VignetteEngine{knitr::rmarkdown}
+%% \VignetteIndexEntry{phyloseq basics vignette}
+-->
+
+`r library("knitr")`
+`r opts_chunk$set(cache=FALSE, fig.width=9, message=FALSE, warning=FALSE)`
+
+Paul J. McMurdie and Susan Holmes
+
+<mcmurdie at stanford.edu>
+
+[phyloseq Home Page](http://joey711.github.io/phyloseq/)
+
+If you find phyloseq and/or its tutorials useful, please acknowledge and cite phyloseq in your publications:
+
+**phyloseq: An R package for reproducible interactive analysis and graphics of microbiome census data** (2013) PLoS ONE 8(4):e61217
+http://dx.plos.org/10.1371/journal.pone.0061217
+
+## Other resources
+The phyloseq project also has a number of supporting online resources, most of which can be found at [the phyloseq home page](http://joey711.github.com/phyloseq/), or from the phyloseq stable release [page on Bioconductor](http://bioconductor.org/packages/release/bioc/html/phyloseq.html).
+
+To post feature requests or ask for help, try [the phyloseq Issue Tracker](https://github.com/joey711/phyloseq/issues).
+
+
+# Introduction
+
+The analysis of microbiological communities brings many challenges: the integration of many different types of data with methods from ecology, genetics, phylogenetics, network analysis, visualization and testing. The data itself may originate from widely different sources, such as the microbiomes of humans, soils, surface and ocean waters, wastewater treatment plants, industrial facilities, and so on; and as a result, these varied sample types may have very different forms and scales of [...]
+
+
+# About this vignette
+
+## Typesetting Legend <a id="sec:typeset-legend"></a>
+
+- **bold** - Bold is used for emphasis.
+- *italics* - Italics are used for package names, and special words, phrases.
+- `code font` - The font for code, usually courier-like,
+but depends on the theme.
+- `myFun()` - Code font word with `()` attached at the right-end,
+is a function name.
+- [Hyperlink](#sec:typeset-legend) - Hyperlinks are
+clickable text that will jump to sections and external pages.
+
+## Other links and tutorials
+
+An overview of phyloseq's intended functionality, goals, and design is provided
+in the following free and open access article:
+
+McMurdie and Holmes (2013). [phyloseq: An R package for reproducible interactive analysis and graphics of microbiome census data](http://dx.plos.org/10.1371/journal.pone.0061217). PLoS ONE e61217.
+
+The most updated examples are posted in our online tutorials from
+[the phyloseq home page](http://joey711.github.com/phyloseq)
+
+A separate vignette describes analysis tools included in phyloseq along with various examples using included example data. A quick way to load it is:
+
+```{r, eval=FALSE}
+vignette("phyloseq_analysis")
+```
+
+By contrast, this vignette is intended to provide functional examples of the basic data import and manipulation infrastructure included in phyloseq. This includes example code for importing OTU-clustered data from different clustering pipelines, as well as performing clear and reproducible filtering tasks that can be altered later and checked for robustness. The motivation for including tools like this in phyloseq is to save time, and also to build-in a structure that requires consistenc [...]
+
+
+# phyloseq classes <a id="sec:app-classes"></a>
+
+The class structure in the *phyloseq* package follows the inheritance diagram shown in the figure below.
+Currently, *phyloseq* uses 4 core data classes.
+They are
+(1) the OTU abundance table (`otu_table`);
+(2) a table of sample data (`sample_data`);
+(3) a table of taxonomic descriptors (`taxonomyTable`); and
+(4) a phylogenetic tree (`"phylo"`-class, [ape package](http://cran.r-project.org/web/packages/ape/)).
+
+The `otu_table` class can be considered the central data type,
+as it directly represents the number and type of sequences observed in each sample.
+`otu_table` extends the numeric matrix class in the `R` base,
+and has a few additional feature slots.
+The most important of these feature slots is the `taxa_are_rows` slot,
+which holds a single logical that indicates whether the table is oriented
+with taxa as rows (as in the *genefilter* package in [Bioconductor](#cite:bioconductor))
+or with taxa as columns (as in *vegan* and *picante* packages).
+In *phyloseq* methods, as well as its extensions of methods in other packages,
+the `taxa_are_rows` value is checked to ensure proper orientation of the `otu_table`.
+A *phyloseq* user is only required to specify the `otu_table` orientation during initialization, following which all handling is internal.
+
+The `sample_data` class directly inherits `R`'s `data.frame` class, and thus effectively stores both categorical and numerical data about each sample. The orientation of a `data.frame` in this context requires that samples/trials are rows, and variables are columns (consistent with *vegan* and other packages). The `taxonomyTable` class directly inherits the `matrix` class, and is oriented such that rows are taxa/OTUs and columns are taxonomic levels (e.g. *Phylum*).
+
+The phyloseq-class can be considered an "experiment-level class" and should contain two or more of the previously-described core data classes. We assume that *phyloseq* users will be interested in analyses that utilize their abundance counts derived from the phylogenetic sequencing data, and so the `phyloseq()` constructor will stop with an error if the arguments do not include an `otu_table`. There are a number of common methods that require either an `otu_table` and `sample_data` combi [...]
+
+![phyloseq class structure](phyloseq_classes_7.png)
+ Classes and inheritance in the *phyloseq* package. The class name and its slots are shown with red- or blue-shaded text, respectively. Coercibility is indicated graphically by arrows with the coercion function shown. Lines without arrows indicate that the more complex class (`phyloseq`) contains a slot with the associated data class as its components.
+
+
+# Load *phyloseq* and import data <a id="sec:load"></a>
+
+Now let's get started by loading phyloseq, and describing some methods for importing data.
+
+## Load *phyloseq*
+
+To use *phyloseq* in a new R session, it will have to be loaded. This can be done in your package manager, or at the command line using the `library()` command:
+```{r load-packages, message=FALSE, warning=FALSE}
+library("phyloseq")
+```
+
+## Import data
+
+An important feature of *phyloseq* is its set of methods
+for importing phylogenetic sequencing data
+from common taxonomic clustering pipelines.
+These methods take file pathnames as input,
+read and parse those files,
+and return a single object that contains all of the data.
+
+Some additional background details are provided below.
+The best reproducible examples on importing data with phyloseq
+can be found on the official data import tutorial page:
+
+http://joey711.github.com/phyloseq/import-data
+
+
+## Import from biom-format <a id="sec:biom"></a>
+
+New versions of QIIME (see below) produce a file in *version 2* of the
+[biom file format](http://biom-format.org/),
+which is a specialized definition of the HDF5 format.
+
+The phyloseq package provides the `import_biom()` function,
+which can import both
+*Version 1* (JSON) and
+*Version 2* (HDF5)
+of the BIOM file format.
+
+The *phyloseq* package fully supports
+both taxa and sample observations of the biom format standard,
+and works with the BIOM files output from QIIME, RDP, MG-RAST, etc.
+
+
+## Import from QIIME (Modern)<a id="sec:qiimeimport"></a>
+
+The default output from modern versions of QIIME
+is a BIOM-format file (among others).
+This is supported in phyloseq.
+
+### Sample data from QIIME
+
+Sometimes inaccurately referred to as *metadata*,
+additional observations on samples provided as *mapping file* to QIIME
+have not typically been output in the BIOM files,
+**even though BIOM format supports it**.
+This failure to support the full capability of the BIOM format
+means that you'll have to provide sample observations as a separate file.
+There are many ways to do this, but the QIIME sample map is supported.
+
+### Input
+
+Two QIIME output files (`.biom`, `.tre`)
+are recognized by the `import_biom()` function.
+One QIIME input file (sample map, tab-delimited),
+is recognized by the `import_qiime_sample_data()` function.
+
+---
+ Input File(s) | phyloseq function | Output
+ --- | --- | ---
+ `.biom`, `.tre` | `import_biom()` | phyloseq object with OTU table, taxonomy table, and tree (if provided)
+ `.tre` | `read_tree()` | `phylo` object, representing phylogenetic tree.
+ `map.txt` | `import_qiime_sample_data()` | A `sample_data` object
+---
+
+The objects created by each of the import functions above
+should be merged using `merge_phyloseq` to create one coordinated, self-consistent object.
+
+### Output
+
+- **Before Merging** - Before merging with `merge_phyloseq`, the output from these import activities is the three separate objects listed in the previous table.
+- **After Merging** - After merging you have a single self-consistent phyloseq object
+that contains an OTU table, taxonomy table, sample-data, and a phylogenetic tree.
+
+### QIIME Example Tutorial
+
+QIIME's "Moving Pictures" example tutorial output is a little too large
+to include within the phyloseq package
+(and thus is not directly included in this vignette).
+However, the phyloseq home page includes
+a full reproducible example of the import procedure described above:
+
+**Link HERE**
+
+For reference, or if you want to try yourself,
+the following are the relative paths within the QIIME tutorial directory
+for each of the files you will need.
+
+- BIOM file, originally at:
+`r "moving_pictures_tutorial-1.9.0/illumina/precomputed-output/otus/otu_table_mc2_w_tax_no_pynast_failures.biom"`
+- Tree file, originally at:
+`r "moving_pictures_tutorial-1.9.0/illumina/precomputed-output/otus/rep_set.tre"`
+- Map File, originally at:
+`r "moving_pictures_tutorial-1.9.0/illumina/map.tsv"`
+
+
+## Import from QIIME Legacy<a id="sec:qiimeimportleg"></a>
+
+[QIIME](#cite:QIIME) is a free, open-source OTU clustering and analysis pipeline written for Unix (mostly Linux). It is distributed in a number of different forms (including a pre-installed virtual machine). See [the QIIME home page](http://qiime.org/) for details.
+
+### Input
+
+One QIIME input file (sample map), and two QIIME output files (`otu_table.txt`, `.tre`) are recognized by the `import_qiime()` function. Only one of the three input files is required to run, although an `"otu_table.txt"` file is required if `import_qiime()` is to return a complete experiment object.
+
+In practice, you will have to find the relevant QIIME files among a number of other files created by the QIIME pipeline. A screenshot of the directory structure created during a typical QIIME run is shown in [the QIIME Directory Figure](#fig:qiimedirectory).
+
+
+<a id="fig:qiimedirectory"></a>
+![QIIME directory structure](import_qiime_directory_structure.jpg)
+ A typical QIIME output directory. The two output files suitable for import by *phyloseq* are highlighted. A third file describing the samples, their barcodes and covariates, is created by the user and required as *input* to QIIME. It is a good idea to import this file, as it can be converted directly to a `sample_data` object and can be extremely useful for certain analyses.
+
+
+### Output
+
+The class of the object returned by `import_qiime()` depends upon which filenames are provided. The most comprehensive class is chosen automatically, based on the input files listed as arguments. At least one argument needs to be provided.
+
+
+
+## Import from mothur <a id="sec:mothurimport"></a>
+
+The open-source, platform-independent, locally-installed software package, [mothur](#cite:Schloss:2009do), can also process barcoded amplicon sequences and perform OTU-clustering. It is extensively documented on [the mothur wiki](http://www.mothur.org/wiki/)
+
+### Input
+
+Currently, there are three different files produced by the *mothur* package (Ver `1.22+`) that can be imported by *phyloseq*. At minimum, a user must supply a "`.list`" file, and at least one of the following two files: `.groups` or `.tree`. The group file is produced by *mothur*'s `make.group()` function. Details can be found at [its wiki page](http://www.mothur.org/wiki/Make.group). The tree file is a phylogenetic tree calculated by *mothur*.
+
+### Output
+
+The output from `import_mothur()` depends on which file types are provided. If all three file types are provided, an instance of the phyloseq-class is returned that contains both an OTU abundance table and its associated phylogenetic tree.
+
+
+## Import from PyroTagger
+
+PyroTagger is an OTU-clustering pipeline for barcoded 16S rRNA amplicon sequences, served and maintained by the Department of Energy's (DOE's) Joint Genome Institute (JGI). It can be used through a straightforward web interface at [the PyroTagger home page](http://pyrotagger.jgi-psf.org/)
+
+PyroTagger takes as input the untrimmed sequence (`.fasta`) and sequence-quality (`.qual`) files, as well as a sample mapping file that contains the bar code sequence for each sample and its name. It uses a 97\% identity threshold for defining OTU clusters (approximately species-level of taxonomic distinction), and provides no options for specifying otherwise. It does allow users to modify the threshold setting for low-quality bases.
+
+### Input
+
+PyroTagger returns a single excel spreadsheet file (`.xls`) containing both abundance and taxonomy data, as well as some associated confidence information related to each taxonomic assignment. This spreadsheet also reports on potential chimeric sequences. This single output file is sufficient for `import_RDP_tab()`, provided the file has been converted to a tab-delimited plain-text format. Any spreadsheet application should suffice. No other changes should be made to the `.xls` file.
+
+### Output
+
+`import_RDP_tab()` returns an instance of the phyloseq-class that contains the OTU abundance table and taxonomy table. To my knowledge, PyroTagger does not calculate a tree of the representative sequences from each OTU cluster, nor a distance object, so analyses like `tip_glom()` and `UniFrac` are not applicable.
+
+
+## Import from RDP pipeline
+
+The Ribosomal Database Project ([RDP](http://rdp.cme.msu.edu/)) provides a web-based barcoded 16S rRNA amplicon sequence processing pipeline called the [RDP Pyrosequencing Pipeline](http://pyro.cme.msu.edu/). A user must run all three of the "Data Processing" steps sequentially through the web interface in order to acquire the output from Complete Linkage Clustering, the approach to OTU clustering used by the RDP Pipeline. Note that this import function assumes that the sequence names in [...]
+
+### Input
+
+The output from the Complete Linkage Clustering, `.clust`, is the only input to the RDP pipeline importer:
+
+```{r, eval=FALSE}
+myOTU1 <- import_RDP_cluster("path/to/my/filename.clust")
+```
+
+### Output
+
+This importer returns an `otu_table` object.
+
+### Expected Naming Convention
+
+The RDP cluster pipeline (specifically, the output of the complete linkage clustering step) has no formal documentation for the ".clust" file structure or its apparent sequence naming convention.
+
+The cluster file itself contains the names of all sequences contained in the input alignment. If the upstream barcode and alignment processing steps are also done with the RDP pipeline, then the sequence names follow a predictable naming convention wherein each sequence is named by its sample and sequence ID, separated by a `"_"` as delimiter:
+
+`sampleName_sequenceIDnumber`
+
+This import function assumes that the sequence names in the cluster file follow this convention, and that the sample name does not contain any `"_"`. It is unlikely to work if this is not the case. It is likely to work if you used the upstream steps in the RDP pipeline to process your raw (barcoded, untrimmed) fasta/fastq data.
+
+
+
+## Example Data (included)
+
+There are multiple example data sets included in *phyloseq*. Many are from published investigations and include documentation with a summary and references, as well as some example code representing some aspect of analysis available in *phyloseq*. In the package index, go to the names beginning with "data-" to see the documentation of currently available example datasets.
+
+To load example data into the working environment, use the `data()` command:
+
+```{r, eval=FALSE}
+data(GlobalPatterns)
+data(esophagus)
+data(enterotype)
+data(soilrep)
+```
+
+Similarly, entering `?enterotype` will reveal the documentation for the so-called "enterotype" dataset. For detailed examples, see [the Example Data tutorial](http://joey711.github.io/phyloseq/Example-Data.html).
+
+## phyloseq Object Summaries
+
+In small font, the following is the summary of the `GlobalPatterns` dataset that prints to the terminal. These summaries are consistent among all `phyloseq-class` objects. Although the components of `GlobalPatterns` have many thousands of elements, the command-line returns only a short summary of each component. This encourages you to check that an object is still what you expect, without needing to let thousands of elements scroll across the terminal. In the cases in which you do want t [...]
+
+
+```{r}
+data(GlobalPatterns)
+GlobalPatterns
+```
+
+
+## Convert raw data to phyloseq components
+
+Suppose you have already imported raw data from an experiment into `R`, and their indices are labeled correctly. How do you get *phyloseq* to recognize these tables as the appropriate class of data? And further combine them together? The [Table of Component Constructor Functions](#table:build) lists key functions for converting these core data formats into specific component data objects recognized by *phyloseq*.
+
+ Table of component constructor functions for building component data objects <a id="table:build"></a>
+
+---
+ Function | Input Class | Output Description
+ --- | --- | ---
+ `otu_table` | numeric matrix | `otu_table` object storing OTU abundance
+ `otu_table` | data.frame | `otu_table` object storing OTU abundance
+ `sample_data` | data.frame | `sample_data` object storing sample variables
+ `tax_table` | character matrix | `taxonomyTable` object storing taxonomic identities
+ `tax_table` | data.frame | `taxonomyTable` object storing taxonomic identities
+ `read_tree` | file path char | phylo-class tree, read from file
+ `read.table` | table file path | A matrix or data.frame (Std `R` core function)
+---
+
+ phyloseq constructors: functions for building/merging *phyloseq* objects.
+
+---
+Function | Input Class | Output Description
+--- | --- | ---
+`phyloseq` | Two or more component objects | phyloseq-class, *experiment-level* object
+`merge_phyloseq`| Two or more component or phyloseq-class objects | Combined instance of phyloseq-class
+---
+
+The following example illustrates using the constructor methods for component data tables.
+
+```{r, eval=FALSE}
+otu1 <- otu_table(raw_abundance_matrix, taxa_are_rows=FALSE)
+sam1 <- sample_data(raw_sample_data.frame)
+tax1 <- tax_table(raw_taxonomy_matrix)
+tre1 <- read_tree(my_tree_file)
+```
+
+## phyloseq() function: building complex phyloseq objects
+
+Once you've converted the data tables to their appropriate class, combining them into one object requires only one additional function call, `phyloseq()`:
+```{r, eval=FALSE}
+ex1b <- phyloseq(my_otu_table, my_sample_data, my_taxonomyTable, my_tree)
+```
+
+You do not need to have all four data types in the example above in order to combine them into one validity-checked experiment-level phyloseq-class object. The `phyloseq()` method will detect which component data classes are present, and build accordingly. Downstream analysis methods will access the required components using *phyloseq*'s accessors, and throw an error if something is missing. For most downstream methods you will only need to supply the combined, phyloseq-class object (the [...]
+```{r, eval=FALSE}
+ex1c <- phyloseq(my_otu_table, my_sample_data)
+```
+
+Whenever an instance of the phyloseq-class is created by *phyloseq* --- for example, when we use the `import_qiime()` function to import data, or combine manually imported tables using `phyloseq()` --- the row and column indices representing taxa or samples are internally checked/trimmed for compatibility, such that all component data describe exactly (and only) the same OTUs and samples.
+
+## Merge
+
+The phyloseq project includes support for two completely different categories of merging.
+
+ - Merging the OTUs or samples in a phyloseq object, based upon a taxonomic or sample variable: `merge_samples()`, `merge_taxa()`
+ - Merging two or more data objects that come from the same experiment, so that their data becomes part of the same phyloseq object: `merge_phyloseq()`
+
+For further details, see the reproducible online tutorial at:
+
+http://joey711.github.com/phyloseq/merge
+
+
+
+# Accessor functions <a id="sec:accessors"></a>
+
+Once you have a phyloseq object available, many accessor functions are available to query aspects of the data set. The function name and its purpose are summarized in [the Accessor Functions Table](#table:access).
+
+ Accessor functions for *phyloseq* objects.
+
+<a id="table:access"></a>
+
+---
+Function | Returns
+--- | ---
+ `[` | Standard extraction operator. Works on `otu_table`, `sample_data`, and `taxonomyTable`
+ `access` | General slot accessor function for phyloseq-package
+ `get_taxa` | Abundance values of all taxa in sample `i`
+ `get_sample` | Abundance values of taxa `i` for all samples
+ `get_taxa_unique` | A unique vector of the observed taxa at a particular taxonomic rank
+ `get_variable` | An individual sample variable vector/factor
+ `nsamples` | Get the number of samples described by an object
+ `ntaxa` | Get the number of OTUs (taxa) described by an object
+ `otu_table` | Build or access otu_table objects
+ `rank_names` | Get the names of the available taxonomic ranks
+ `sample_data` | Build or access `sample_data` objects
+ `sample_names` | The names of all samples
+ `taxa_names` | The names of all taxa
+ `sample_sums` | The sum of the abundance values of each sample
+ `sample_variables` | The names of sample variables
+ `taxa_sums` | The sum of the abundance values of each taxa
+ `taxa_are_rows` | `TRUE` if taxa are row indices in `otu_table`
+ `tax_table` | A taxonomy table
+ `phy_tree` | Access the tree contained in a phyloseq object
+---
+
+
+
+# Trimming, subsetting, filtering phyloseq data <a id="sec:trim"></a>
+
+## Trimming: prune_taxa()
+Trimming high-throughput phylogenetic sequencing data can be useful, or even necessary, for certain types of analyses. However, it is important that the original data always be available for reference and reproducibility; and that the methods used for trimming be transparent to others, so they can perform the same trimming or filtering steps on the same or related data. To facilitate this, *phyloseq* contains many ways to trim/filter the data from a phylogenetic sequencing project. Becau [...]
+
+In general, most trimming should be accomplished using the S4 methods `prune_taxa()` or `prune_samples()`.
+
+## Simple filtering example
+
+```{r echo=FALSE}
+topN <- 20
+```
+
+For example, let's make a new object that only holds the most abundant `r topN` taxa in the experiment. To accomplish this, we will use the `prune_taxa()` function.
+
+```{r}
+data(GlobalPatterns)
+most_abundant_taxa <- sort(taxa_sums(GlobalPatterns), TRUE)[1:topN]
+ex2 <- prune_taxa(names(most_abundant_taxa), GlobalPatterns)
+```
+
+Now we can ask the question, "what taxonomic Family are these OTUs?" (Subsetting still returns a `taxonomyTable` object, which is summarized. We will need to convert to a vector)
+
+```{r}
+topFamilies <- tax_table(ex2)[, "Family"]
+as(topFamilies, "vector")
+```
+
+## Arbitrarily complex abundance filtering
+
+The previous example was a relatively simple filtering in which we kept only the most abundant `r topN` taxa in the whole experiment. But what if we wanted to keep the most abundant `r topN` taxa of each sample? And of those, keep only the taxa that are also found in at least one-third of our samples? What if we wanted to keep only those taxa that met some across-sample criteria?
+
+### genefilter_sample(): Filter by Within-Sample Criteria
+
+For this more complicated filtering *phyloseq* contains a function, `genefilter_sample`, that takes as an argument a *phyloseq* object, as well as a list of one or more filtering functions that will be applied to each sample in the abundance matrix (`otu_table`), as well as an integer argument, `A`, that specifies for how many samples the filtering function must return `TRUE` for a particular taxa to avoid removal from the object. A supporting function `filterfun_sample` is also included [...]
+
+Here is an example on a completely fabricated `otu_table` called `testOTU`.
+```{r, eval=FALSE}
+testOTU <- otu_table(matrix(sample(1:50, 25, replace=TRUE), 5, 5), taxa_are_rows=FALSE)
+f1<- filterfun_sample(topk(2))
+wh1 <- genefilter_sample(testOTU, f1, A=2)
+wh2 <- c(T, T, T, F, F)
+prune_taxa(wh1, testOTU)
+prune_taxa(wh2, testOTU)
+```
+
+Here is a second example using the included dataset, `GlobalPatterns`. The most abundant taxa are kept only if they are in the most abundant 10\% of taxa in at least half of the samples in dataset `GlobalPatterns`. Note that it is not necessary to subset `GlobalPatterns` in order to do this filtering. The S4 method `prune_taxa` subsets each of the relevant component objects, and returns the complex object back.
+
+```{r}
+data(GlobalPatterns)
+f1<- filterfun_sample(topp(0.1))
+wh1 <- genefilter_sample(GlobalPatterns, f1, A=(1/2*nsamples(GlobalPatterns)))
+sum(wh1)
+ex2 <- prune_taxa(wh1, GlobalPatterns)
+```
+
+```{r}
+print(ex2)
+```
+
+If instead of the most abundant fraction of taxa, you are interested in the most abundant fraction of individuals (aka sequences, observations), then the `topf` function is appropriate. For steep rank-abundance curves, `topf` will seem to be much more conservative (trim more taxa) because it is based on the cumulative sum of relative abundance. It does not guarantee that a certain number or fraction of total taxa (richness) will be retained.
+
+```{r, eval=FALSE}
+data(GlobalPatterns)
+f1<- filterfun_sample(topf(0.9))
+wh1 <- genefilter_sample(GlobalPatterns, f1, A=(1/3*nsamples(GlobalPatterns)))
+sum(wh1)
+prune_taxa(wh1, GlobalPatterns)
+```
+
+### filter_taxa(): Filter by Across-Sample Criteria
+
+The `filter_taxa` function is directly analogous to the `genefilter` function for microarray filtering, but is used for filtering OTUs from phyloseq objects. It applies an arbitrary set of functions -- as a function list, for instance, created by `genefilter::filterfun` -- as across-sample criteria, one OTU at a time. It can be thought of as an extension of the genefilter-package (from the Bioconductor repository) for phyloseq objects. It takes as input a phyloseq object, and returns a l [...]
+
+Inspect the following example. Note that the functions `filterfun` and `kOverA` are from the genefilter package.
+
+```{r}
+data("enterotype")
+library("genefilter")
+flist<- filterfun(kOverA(5, 2e-05))
+ent.logi <- filter_taxa(enterotype, flist)
+ent.trim <- filter_taxa(enterotype, flist, TRUE)
+identical(ent.trim, prune_taxa(ent.logi, enterotype))
+identical(sum(ent.logi), ntaxa(ent.trim))
+filter_taxa(enterotype, flist, TRUE)
+```
+
+## subset_samples(): Subset by Sample Variables
+
+It is possible to subset the samples in a *phyloseq* object based on the sample variables using the `subset_samples()` function. For example to subset `GlobalPatterns` such that only certain environments are retained, the following line is needed (the related tables are subsetted automatically as well):
+
+```{r}
+ex3 <- subset_samples(GlobalPatterns, SampleType%in%c("Freshwater", "Ocean", "Freshwater (creek)"))
+ex3
+```
+
+For this example only a categorical variable is shown, but in principle a continuous variable could be specified and a logical expression provided just as for the `subset` function. In fact, because `sample_data` component objects are an extension of the data.frame class, they can also be subsetted with the `subset` function:
+
+```{r}
+subset(sample_data(GlobalPatterns), SampleType%in%c("Freshwater", "Ocean", "Freshwater (creek)"))
+```
+
+## subset_taxa(): subset by taxonomic categories
+
+It is possible to subset by specific taxonomic category using the `subset_taxa()` function. For example, if we wanted to subset `GlobalPatterns` so that it only contains data regarding the phylum *Firmicutes*:
+
+```{r}
+ex4 <- subset_taxa(GlobalPatterns, Phylum=="Firmicutes")
+ex4
+```
+
+## random subsample abundance data
+
+You can also randomly subset the data, for example by taking a random subset of 100 taxa from the full dataset.
+
+```{r}
+randomSpecies100 <- sample(taxa_names(GlobalPatterns), 100, replace=FALSE)
+ex5 <- prune_taxa(randomSpecies100, GlobalPatterns)
+```
+
+
+# Transform abundance data<a id="sec:transform"></a>
+
+Sample-wise transformation can be achieved with the `transform_sample_counts()` function. It requires two arguments, (1) the *phyloseq* object that you want to transform, and the function that you want to use to perform the transformation. Any arbitrary function can be provided as the second argument, as long as it returns a numeric vector with the same length as its input. In the following trivial example, we create a second object, `ex2`, that has been "transformed" by the identity fun [...]
+
+```{r, eval=FALSE}
+data(GlobalPatterns)
+ex2 <- transform_sample_counts(GlobalPatterns, I)
+```
+
+For certain kinds of analysis we may want to transform the abundance data. For example, for RDA we want to transform abundance counts to within-sample ranks, and to further include a threshold beyond which all taxa receive the same rank value. The ranking for each sample is performed independently, so that the rank of a particular taxa within a particular sample is not influenced by that sample's total quantity of sequencing relative to the other samples in the project.
+
+The following example shows how to perform such a thresholded-rank transformation of the abundance table in the complex *phyloseq* object `GlobalPatterns` with an arbitrary threshold of 500.
+
+```{r}
+ex4<- transform_sample_counts(GlobalPatterns, threshrankfun(500))
+```
+
+
+# Phylogenetic smoothing <a id="sec:glom"></a>
+
+## tax_glom()
+
+Suppose we are skeptical about the importance of OTU-level distinctions in our dataset. For this scenario, *phyloseq* includes a taxonomic-agglomeration method, `tax_glom()`, which merges taxa of the same taxonomic category for a user-specified taxonomic level. In the following code, we merge all taxa of the same Genus, and store that new object as `ex6`.
+
+```{r, eval=FALSE}
+ex6 <- tax_glom(GlobalPatterns, taxlevel="Genus")
+```
+
+## tip_glom()
+
+Similarly, our original example object (`GlobalPatterns`) also contains a phylogenetic tree corresponding to each OTU, which we could also use as a means to merge taxa in our dataset that are closely related. In this case, we specify a threshold patristic distance. Taxa more closely related than this threshold are merged. This is especially useful when a dataset has many taxa that lack a taxonomic assignment at the level you want to investigate, a problem when using `tax_glom()`. Note th [...]
+
+```{r, eval=FALSE}
+ex7 <- tip_glom(GlobalPatterns, speciationMinLength = 0.05)
+```
+
+Command output is not provided here to save time during compilation of the vignette. You are encouraged to try this out on your own dataset, or even this example, if interested. It may take a while to run on the full, untrimmed data.
+
+
+# Installation
+
+## Installation
+
+Please check [the phyloseq installation tutorial](http://joey711.github.com/phyloseq/install) for help with installation. This is likely to be the first place news and updated information about installation will be posted, as well. Also check out the rest of [the phyloseq homepage on GitHub](http://joey711.github.io/phyloseq/), as this is the best place to post issues, bug reports, feature requests, contribute code, etc.
+
+## Installing Parallel Backend
+
+For running parallel implementations of functions/methods in *phyloseq* (e.g. `UniFrac(GlobalPatterns, parallel=TRUE)`), you will also need to install a package for registering a parallel "backend". Only one working parallel backend is needed, but there are several options, and the best one will depend on the details of your particular system. The "doParallel" package is a good place to start. Any one of the following lines from an `R` session will install a backend package.
+
+```{r, eval=FALSE}
+install.packages("doParallel")
+install.packages("doMC")
+install.packages("doSNOW")
+install.packages("doMPI")
+```
+
+
+# References
+
+<a id="cite:bioconductor"></a>
+Robert C Gentleman, Vincent J. Carey, Douglas M. Bates, et al. **Bioconductor: Open software development for computational biology and bioinformatics.** *Genome Biology* 5:R80, 2004.
+
+<a id="cite:QIIME"></a>
+J Gregory Caporaso, Justin Kuczynski, Jesse Stombaugh, Kyle Bittinger, Frederic D Bushman, et al. **QIIME allows analysis of high-throughput community sequencing data.** *Nature Methods* 7(5):335-336, 2010.
+
+<a id="cite:Schloss:2009do"></a>
+P D Schloss, S L Westcott, T Ryabin, J R Hall, M Hartmann, et al. **Introducing mothur: Open-Source, Platform-Independent, Community-Supported Software for Describing and Comparing Microbial Communities.** *Applied and Environmental Microbiology* 75(23):7537-7541, 2009.
+
+<a id="cite:RDP"></a>
+J R Cole, Q Wang, E Cardenas, J Fish, B Chai et al. **The Ribosomal Database Project: improved alignments and new tools for rRNA analysis.** *Nucleic Acids Research* 37(Database issue):D141-5, 2009.
diff --git a/vignettes/phyloseq-mixture-models.Rmd b/vignettes/phyloseq-mixture-models.Rmd
new file mode 100644
index 0000000..3e3b03e
--- /dev/null
+++ b/vignettes/phyloseq-mixture-models.Rmd
@@ -0,0 +1,191 @@
+---
+title: "Example using Negative Binomial in Microbiome Differential Abundance Testing"
+output:
+ BiocStyle::html_document:
+ fig_height: 7
+ fig_width: 10
+ toc: yes
+ toc_depth: 2
+ number_sections: true
+---
+<!--
+%% \VignetteEngine{knitr::rmarkdown}
+%% \VignetteIndexEntry{phyloseq and DESeq2 on Colorectal Cancer Data}
+-->
+
+`r library("knitr")`
+`r opts_chunk$set(cache=FALSE, fig.width=9, message=FALSE, warning=FALSE)`
+
+Paul J. McMurdie and Susan Holmes
+
+<mcmurdie at stanford.edu>
+
+[phyloseq Home Page](http://joey711.github.io/phyloseq/)
+
+If you find phyloseq and/or its tutorials useful, please acknowledge and cite phyloseq in your publications:
+
+**phyloseq: An R package for reproducible interactive analysis and graphics of microbiome census data** (2013) PLoS ONE 8(4):e61217
+http://dx.plos.org/10.1371/journal.pone.0061217
+
+# Other resources
+The phyloseq project also has a number of supporting online resources, most of which can be found at [the phyloseq home page](http://joey711.github.com/phyloseq/), or from the phyloseq stable release [page on Bioconductor](http://bioconductor.org/packages/release/bioc/html/phyloseq.html).
+
+To post feature requests or ask for help, try [the phyloseq Issue Tracker](https://github.com/joey711/phyloseq/issues).
+
+
+# The experimental data used in this example
+
+In this example I use the publicly available data from a study on colorectal cancer:
+
+[Genomic analysis identifies association of Fusobacterium with colorectal carcinoma](http://genome.cshlp.org/content/22/2/292.long).
+Kostic, A. D., Gevers, D., Pedamallu, C. S., Michaud, M., Duke, F., Earl, A. M., et al. (2012). *Genome research*, 22(2), 292-298.
+
+As a side-note, this work was published ahead of print in [Genome Research](http://genome.cshlp.org/) alongside a highly-related article from a separate group of researchers (long-live reproducible observations!): [Fusobacterium nucleatum infection is prevalent in human colorectal carcinoma](http://genome.cshlp.org/content/22/2/299.long). In case you are interested. For the purposes of example, however, we will stick to the data from the former study, with data available at the [microbio [...]
+
+Data source, from methods section in article:
+
+> The 16S gene data set consists of 454 FLX Titanium sequences spanning the V3 to V5 variable regions obtained for 190 samples (95 pairs). Detailed protocols used for 16S amplification and sequencing are available on the HMP Data Analysis and Coordination Center website (http://www.hmpdacc.org/tools_protocols/tools_protocols.php).
+
+Study ID: `1457`
+
+Project Name: `Kostic_colorectal_cancer_fusobacterium`
+
+Study Abstract:
+
+> The tumor microenvironment of colorectal carcinoma is a complex community of genomically altered cancer cells, nonneoplastic cells, and a diverse collection of microorganisms. Each of these components may contribute to carcinogenesis; however, the role of the microbiota is the least well understood. We have characterized the composition of the microbiota in colorectal carcinoma using whole genome sequences from nine tumor/normal pairs. Fusobacterium sequences were enriched in carcinom [...]
+
+# Import data with phyloseq, convert to DESeq2
+
+Start by loading phyloseq.
+
+```{r load-phyloseq, message=FALSE, warning=FALSE}
+# Attach phyloseq, then print the installed version
+library("phyloseq")
+packageVersion("phyloseq")
+```
+
+Define the file path, and import the published OTU count data into R.
+
+```{r filepath}
+# Resolve the zipped study archive under "extdata" in the installed phyloseq package
+filepath <- system.file(
+  "extdata",
+  "study_1457_split_library_seqs_and_mapping.zip",
+  package = "phyloseq"
+)
+# Import the archive at that path
+kostic <- microbio_me_qiime(filepath)
+```
+
+Here I had to use a relative file path so that this example works on all systems that have phyloseq installed. In practice, your file path will look like this (if you've downloaded the data ahead of time):
+
+```{r example-path-local, eval=FALSE}
+# Path to a copy of the study archive downloaded ahead of time
+filepath <- "~/Downloads/study_1457_split_library_seqs_and_mapping.zip"
+kostic <- microbio_me_qiime(filepath)
+```
+
+Or like this (if you're accessing data directly from the microbio.me/qiime server):
+
+```{r example-path-remote, eval=FALSE}
+# Pass the numeric study ID instead of a file path
+study_id <- 1457
+kostic <- microbio_me_qiime(study_id)
+```
+
+
+# Convert to DESeq2's DESeqDataSet class
+
+In this example I'm using the major sample covariate, `DIAGNOSIS`, as the study design factor. The focus of this study was to compare the microbiomes of pairs of healthy and cancerous tissues, so this makes sense. Your study could have a more complex or nested design, and you should think carefully about the study design formula, because this is critical to the test results and their meaning. You might even need to define a new factor if none of the variables in your current table approp [...]
+
+Here is the summary of the data variable `kostic` that we are about to use, as well as the first few entries of the `DIAGNOSIS` factor.
+```{r show-variables}
+# Print the summary of the imported object
+kostic
+# First 10 entries of the DIAGNOSIS sample variable
+diagnosis <- sample_data(kostic)$DIAGNOSIS
+head(diagnosis, n = 10)
+```
+
+# DESeq2 conversion and call
+
+First load DESeq2.
+
+```{r deseq2, message=FALSE, warning=FALSE}
+# Attach DESeq2, then print the installed version
+library("DESeq2")
+packageVersion("DESeq2")
+```
+
+The following two lines actually do all the complicated DESeq2 work. The function `phyloseq_to_deseq2` converts your phyloseq-format microbiome data into a `DESeqDataSet` with dispersions estimated, using the experimental design formula, also shown (the `~DIAGNOSIS` term). The `DESeq` function does the rest of the testing, in this case with default testing framework, but you can actually use alternatives.
+
+First remove the 5 samples that had no `DIAGNOSIS` attribute assigned.
+These introduce a spurious third design class
+that is actually a rare artifact in the dataset.
+Also remove samples with `500` or fewer reads (counts).
+Note that this kind of data cleanup
+is useful, necessary, and should be well-documented
+because it can also be dangerous to alter or omit data
+without clear documentation.
+In this case I actually explored the data first,
+and am omitting some of the details
+(and explanatory plots) here for clarity.
+
+```{r rm-bad-samples}
+# Drop the samples whose DIAGNOSIS is "None"
+kostic <- subset_samples(kostic, DIAGNOSIS != "None")
+# Keep only the samples whose total count exceeds 500
+has_enough_reads <- sample_sums(kostic) > 500
+kostic <- prune_samples(has_enough_reads, kostic)
+kostic
+```
+
+
+```{r run-deseq2}
+# Convert to a DESeqDataSet using DIAGNOSIS as the design factor
+diagdds <- phyloseq_to_deseq2(kostic, ~ DIAGNOSIS)
+# Geometric mean that tolerates zeros: only entries with x > 0 enter the
+# log-sum, while the divisor stays length(x)
+gm_mean <- function(x, na.rm = TRUE) {
+  log_total <- sum(log(x[x > 0]), na.rm = na.rm)
+  exp(log_total / length(x))
+}
+# Row-wise geometric means of the count matrix, supplied to size-factor estimation
+count_matrix <- counts(diagdds)
+geoMeans <- apply(count_matrix, 1, gm_mean)
+diagdds <- estimateSizeFactors(diagdds, geoMeans = geoMeans)
+diagdds <- DESeq(diagdds, fitType = "local")
+```
+Note: The default multiple-inference correction is Benjamini-Hochberg, and occurs within the `DESeq` function.
+
+
+# Investigate test results table
+
+The following `results` function call creates a table of the results of the tests. Very fast. The hard work was already stored with the rest of the DESeq2-related data in our latest version of the `diagdds` object (see above). I then order by the adjusted p-value, removing the entries with an `NA` value. The rest of this example is just formatting the results table with taxonomic information for nice(ish) display in the HTML output.
+
+```{r grab-results-process-table}
+alpha <- 0.01
+res <- results(diagdds)
+# Sort by adjusted p-value; na.last = NA removes the rows whose padj is NA
+res <- res[order(res$padj, na.last = NA), ]
+# Rows below the significance threshold
+is_significant <- res$padj < alpha
+sig_res <- res[is_significant, ]
+# Taxonomy rows for the same OTUs, as a plain matrix
+sig_taxa <- as(tax_table(kostic)[rownames(sig_res), ], "matrix")
+sigtab <- cbind(as(sig_res, "data.frame"), sig_taxa)
+head(sigtab)
+```
+
+Let's look at just the OTUs that were significantly enriched in the carcinoma tissue. First, cleaning up the table a little for legibility.
+
+```{r table-prelim}
+# Columns kept for display
+display_cols <- c("baseMean", "log2FoldChange", "lfcSE", "padj", "Phylum", "Class", "Family", "Genus")
+# Rows with a positive log2 fold change, restricted to the display columns
+posigtab <- sigtab[sigtab$log2FoldChange > 0, display_cols]
+```
+```{r make-markdown-table, echo=FALSE, results='asis'}
+# Write the table as pipe-delimited markdown: header row, rule row, then one line per table row
+posigtab <- data.frame(OTU = rownames(posigtab), posigtab)
+header_row <- paste(colnames(posigtab), collapse = " | ")
+rule_row <- paste(rep("---", times = ncol(posigtab)), collapse = " | ")
+cat(header_row, fill = TRUE)
+cat(rule_row, fill = TRUE)
+# The apply() result is assigned only so that it is not printed
+dummy <- apply(posigtab, 1, function(table_row) {
+  cat(paste(table_row, collapse = " | "), fill = TRUE)
+})
+```
+
+As expected from the original study abstract and title, a *Fusobacterium* OTU was among the most-significantly differentially abundant between the cancerous and healthy samples.
+
+
+# Plot Results
+
+Here is a dot plot of the log2-fold-change of each significant OTU that has a Genus assignment, with Genus on the vertical axis and points colored by Phylum. Uses some ggplot2 commands.
+
+```{r bar-plot}
+library("ggplot2")
+theme_set(theme_bw())
+# Keep only the rows that have a Genus assignment
+sigtabgen <- sigtab[!is.na(sigtab$Genus), ]
+# Names of the groups, sorted by decreasing maximum log2 fold change within each group
+levels_by_max_lfc <- function(lfc, groups) {
+  group_max <- tapply(lfc, groups, max)
+  names(sort(group_max, decreasing = TRUE))
+}
+# Phylum order
+phylum_levels <- levels_by_max_lfc(sigtabgen$log2FoldChange, sigtabgen$Phylum)
+sigtabgen$Phylum <- factor(as.character(sigtabgen$Phylum), levels = phylum_levels)
+# Genus order
+genus_levels <- levels_by_max_lfc(sigtabgen$log2FoldChange, sigtabgen$Genus)
+sigtabgen$Genus <- factor(as.character(sigtabgen$Genus), levels = genus_levels)
+# Rotated x-axis tick labels
+x_tick_text <- element_text(angle = -90, hjust = 0, vjust = 0.5)
+ggplot(sigtabgen, aes(x = log2FoldChange, y = Genus, color = Phylum)) +
+  geom_vline(xintercept = 0.0, color = "gray", size = 0.5) +
+  geom_point(size = 6) +
+  theme(axis.text.x = x_tick_text)
+```
+
diff --git a/vignettes/phyloseq_classes_7.png b/vignettes/phyloseq_classes_7.png
new file mode 100644
index 0000000..199c216
Binary files /dev/null and b/vignettes/phyloseq_classes_7.png differ
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/r-bioc-phyloseq.git
More information about the debian-med-commit
mailing list