[med-svn] [r-cran-distory] 03/14: New upstream version 1.4.2
Andreas Tille
tille at debian.org
Sat Sep 23 06:49:14 UTC 2017
This is an automated email from the git hooks/post-receive script.
tille pushed a commit to branch master
in repository r-cran-distory.
commit 8b18df95df9ba040cc27d62665bf5b03864b0030
Author: Andreas Tille <tille at debian.org>
Date: Sat Sep 23 07:15:07 2017 +0200
New upstream version 1.4.2
---
DESCRIPTION | 15 ++
MD5 | 26 ++
NAMESPACE | 6 +
R/bethe.tree.R | 35 +++
R/bin.multiPhylo.R | 17 ++
R/dist.multiPhylo.R | 183 ++++++++++++++
R/gromov.hyperbolicity.R | 30 +++
R/mcmc.target.seq.R | 135 +++++++++++
R/orthant.boundary.tree.R | 32 +++
R/phylo.diff.R | 79 +++++++
R/position.leverage.R | 33 +++
R/zzz.R | 5 +
debian/changelog | 5 -
debian/compat | 1 -
debian/control | 23 --
debian/copyright | 39 ---
debian/rules | 3 -
debian/source/format | 1 -
debian/watch | 2 -
man/bethe.tree.Rd | 56 +++++
man/bin.multiPhylo.Rd | 60 +++++
man/dist.multiPhylo.Rd | 122 ++++++++++
man/distory-package.Rd | 51 ++++
man/gromov.hyperbolicity.Rd | 75 ++++++
man/mcmc.target.seq.Rd | 136 +++++++++++
man/orthant.boundary.tree.Rd | 73 ++++++
man/phylo.diff.Rd | 107 +++++++++
man/position.leverage.Rd | 71 ++++++
src/newick.cpp | 318 +++++++++++++++++++++++++
src/newick.h | 19 ++
src/phydist_r.cpp | 383 ++++++++++++++++++++++++++++++
src/phylo.h | 100 ++++++++
src/treedist-stripped.cpp | 553 +++++++++++++++++++++++++++++++++++++++++++
src/treedist.h | 32 +++
34 files changed, 2752 insertions(+), 74 deletions(-)
diff --git a/DESCRIPTION b/DESCRIPTION
new file mode 100644
index 0000000..605d16a
--- /dev/null
+++ b/DESCRIPTION
@@ -0,0 +1,15 @@
+Package: distory
+Type: Package
+Title: Distance Between Phylogenetic Histories
+Version: 1.4.2
+Date: 2010-06-07
+Author: John Chakerian and Susan Holmes
+Maintainer: Susan Holmes <susan at stat.stanford.edu>
+Depends: ape (>= 2.3), R (>= 2.6.0)
+Description: Geodesic distance between phylogenetic trees and
+ associated functions.
+License: BSD
+Packaged: 2013-11-12 09:21:46 UTC; ripley
+Repository: CRAN
+Date/Publication: 2013-11-12 10:27:52
+NeedsCompilation: yes
diff --git a/MD5 b/MD5
new file mode 100644
index 0000000..9f37364
--- /dev/null
+++ b/MD5
@@ -0,0 +1,26 @@
+f168f781a67f6e7f7ab62ae647228db3 *DESCRIPTION
+26de99ccfb9ca8a2a213a4818be3b0bf *NAMESPACE
+850d5d746dff4c1e7e623fe7cd35b03a *R/bethe.tree.R
+a2ed747f17ee32505c78a67a2ab03945 *R/bin.multiPhylo.R
+2add15a328c7986ad1baf02a393239b7 *R/dist.multiPhylo.R
+cbd0f76fea9729eca45ed77e486e8513 *R/gromov.hyperbolicity.R
+4606b7200fc81f8a2c8204b170d8e19a *R/mcmc.target.seq.R
+056d8a5ee03dc5dd336ed03dd90357e7 *R/orthant.boundary.tree.R
+626d600d681b8f4cc1b842759126e3ee *R/phylo.diff.R
+b9984556b72472e92c3b298ddcd06df5 *R/position.leverage.R
+b645bf9091131f9aafecc84714c99776 *R/zzz.R
+d364424cfc58647da8ce0e9d709e829e *man/bethe.tree.Rd
+0bc572f4d938febf6e36a51583d17d7b *man/bin.multiPhylo.Rd
+326d3b4a951ffe4454a8266e70e6fd4d *man/dist.multiPhylo.Rd
+ef5f85add48282f9029861e4cd3c75de *man/distory-package.Rd
+65e6debd9d460f4cccabccb2843abd92 *man/gromov.hyperbolicity.Rd
+2adec96ee8efea3a6b2667bd9fdd0374 *man/mcmc.target.seq.Rd
+b81c0afa8545be62ea04abd94e74d081 *man/orthant.boundary.tree.Rd
+d679ae0bc472e781b887c1506f70d7bd *man/phylo.diff.Rd
+0d297ad815b52a39bfe52ca3075b8a32 *man/position.leverage.Rd
+f4d2b55dddfbaca90b778b3e1f45e536 *src/newick.cpp
+76960301ce979b7a7595211c7f3df496 *src/newick.h
+578f0107cf2171675d24aa978bc145e0 *src/phydist_r.cpp
+ab78adf7f6f16d7cbc09a1265f36b4e1 *src/phylo.h
+32800a06d5f51921d0ce2c1ba960bff1 *src/treedist-stripped.cpp
+16f8de5df9c6f8798a3e6f57495ec821 *src/treedist.h
diff --git a/NAMESPACE b/NAMESPACE
new file mode 100644
index 0000000..ce36d59
--- /dev/null
+++ b/NAMESPACE
@@ -0,0 +1,6 @@
+useDynLib("distory")
+
+# Export all names
+exportPattern(".")
+
+import(ape)
diff --git a/R/bethe.tree.R b/R/bethe.tree.R
new file mode 100644
index 0000000..d7ced3e
--- /dev/null
+++ b/R/bethe.tree.R
@@ -0,0 +1,35 @@
+bethe.tree <- function(tips, level.lengths = NULL, outgroup="O", outgroup.dist=1)
+{
+ if(length(tips) != length(unique(tips)))
+ {
+ stop("Not all tips are unique.\n")
+ }
+
+ if(!is.null(level.lengths))
+ {
+ if(length(level.lengths) == 1)
+ {
+ level.lengths = rep(level.lengths, 2)
+ }
+ }
+
+ if(is.null(level.lengths) || length(level.lengths) == 0)
+ {
+ level.lengths = 1
+ }
+
+ groupings <- as.numeric(sapply(1:(length(tips)/2), function(x) rep(x,2)))
+
+ d <- level.lengths[1]
+ level.lengths <- level.lengths[-1]
+ tips <- lapply(split(tips,groupings), function(x) { sprintf("(%s:%d,%s:%d)",x[1],d,x[2],d)} )
+ if(length(tips) == 1)
+ {
+ read.tree(text=sprintf("(%s:%d, %s:%d);", tips[[1]], d, outgroup, outgroup.dist))
+ }
+ else
+ {
+ bethe.tree(tips, level.lengths, outgroup, outgroup.dist)
+ }
+}
+
diff --git a/R/bin.multiPhylo.R b/R/bin.multiPhylo.R
new file mode 100644
index 0000000..804abc3
--- /dev/null
+++ b/R/bin.multiPhylo.R
@@ -0,0 +1,17 @@
+bin.multiPhylo <- function(treelist)
+{
+ edge.dists <- dist.multiPhylo(treelist, method="edgeset")
+ edge.dists <- as.matrix(edge.dists)
+
+ bin.id = 1
+ binning = rep(NA,length(treelist))
+ while(any(is.na(binning)))
+ {
+ # find the next unbinned tree & find all matching trees
+ next.tr = min(which(is.na(binning)))
+ binning[edge.dists[,next.tr] == 0] = bin.id
+ bin.id = bin.id + 1
+ }
+
+ binning
+}
diff --git a/R/dist.multiPhylo.R b/R/dist.multiPhylo.R
new file mode 100644
index 0000000..0abcf3b
--- /dev/null
+++ b/R/dist.multiPhylo.R
@@ -0,0 +1,183 @@
+dist.multiPhylo <- function(x, method="geodesic", force.multi2di = FALSE,
+ outgroup = NULL, convert.multifurcating = FALSE, use.random.resolution =
+ FALSE, scale = NULL, verbose = FALSE)
+{
+ if(length(x) < 2)
+ return(matrix())
+
+ if(class(x) == "multiPhylo") # ideally, we will have this
+ {
+ # run checks if appropriate
+ # separate it out into a vector of strings
+
+ if(!is.null(outgroup))
+ {
+ x <- lapply(x, function(k)
+ {
+ if(class(k) == "phylo")
+ {
+ if(!is.rooted(k))
+ root(k, outgroup, resolve.root = TRUE)
+ else k
+ }
+ else NA
+ })
+ }
+
+
+ if(force.multi2di == TRUE)
+ {
+ x <- lapply(x, function(k)
+ {
+ if(class(k) == "phylo")
+ {
+ multi2di(k,random=use.random.resolution)
+ }
+ else NA
+ })
+ }
+ else if(convert.multifurcating == TRUE) # won't resolve multifurcations at the root
+ {
+ x <- lapply(x, function(k)
+ {
+ if(class(k) == "phylo")
+ {
+ if(!is.binary.tree(k))
+ multi2di(k,random=use.random.resolution)
+ else k
+ }
+ else NA
+ })
+
+ }
+
+ if(!is.null(scale))
+ {
+ if(class(scale) == "phylo")
+ {
+ T <- sum(scale$edge.length)
+ x <- lapply(x, function(k)
+ {
+ if(class(k) == "phylo")
+ {
+ k$edge.length = k$edge.length * (T /
+ sum(k$edge.length))
+ k
+ }
+ else NA
+ })
+ }
+ else if(class(scale) == "numeric")
+ {
+ x <- lapply(x, function(k)
+ {
+ if(class(k) == "phylo")
+ {
+ k$edge.length = k$edge.length * (scale /
+ sum(k$edge.length))
+ k
+ }
+ else NA
+ })
+ }
+ else
+ {
+ stop("Scale parameter not understood.\n")
+ }
+ }
+
+ # do some sanity checks before we start out
+
+ r <- lapply(x, function(k)
+ {
+ if(class(k) == "phylo")
+ {
+ is.rooted(k)
+ }
+ else NA
+ })
+
+ if(!all(as.logical(r), na.rm=TRUE))
+ {
+ stop("Some trees are not rooted. Specify an outgroup to fix this problem. All trees must be rooted.\n")
+ }
+
+ r <- lapply(x, function(k) { if(class(k) == "phylo") is.binary.tree(k) else NA })
+ if(!all(as.logical(r), na.rm=TRUE))
+ {
+ stop("Some trees are not binary. All input trees must be strictly binary.\n")
+ }
+
+ # check to make sure all trees have the same tip labels
+ tips = x[[1]]$tip.label
+ r <- lapply(x, function(k) { if(class(k) == "phylo") setequal(k$tip.label,
+ tips) else NA})
+
+ if(!all(as.logical(r), na.rm=TRUE))
+ {
+ stop("Not all trees have the same tips.\n")
+ }
+
+ # convert our list of class phylo to a list of strings
+ treestrs <- lapply(x, function(k) { if(class(k) == "phylo")
+ write.tree(k) else "" })
+
+ method=tolower(method)
+
+ method.id = pmatch(method, c("edgeset", "geodesic"))
+
+ # call the C interface function and return the value automatically
+ if(method.id == 1)
+ {
+ rv <- .Call("phycpp_bin_trees", treestrs, PACKAGE="distory")
+ }
+ else if(method.id == 2)
+ {
+ rv <- .Call("phycpp_compute_tree_distance_set", treestrs, as.logical(verbose), PACKAGE="distory")
+ }
+ else
+ {
+ stop("Specified method is not valid")
+ }
+
+ as.dist(rv)
+ }
+
+ else if(typeof(x) == "list")
+ {
+ if(class(x[[1]]) == "phylo") # a list of phylo's that for some reason is not classed as multiPhylo
+ {
+ class(x) <- "multiPhylo" # it already is basically a multiPhylo anyways - we'll mark it as such
+ dist.multiPhylo(x, method=method, force.multi2di=force.multi2di, outgroup=outgroup,
+ convert.multifurcating=convert.multifurcating,
+ use.random.resolution=use.random.resolution,
+ scale=scale, verbose=verbose)
+ }
+ else if(class(x[[1]]) == "character") # a list of strings, presuming one tree each, properly terminated
+ {
+ # read with /ape/, run checks, dump back
+ t <- paste(x, sep="", collapse="")
+ k <- read.tree(text=t)
+ dist.multiPhylo(x, method=method, force.multi2di=force.multi2di, outgroup=outgroup,
+ convert.multifurcating=convert.multifurcating,
+ use.random.resolution=use.random.resolution,
+ scale=scale, verbose=verbose)
+ }
+ }
+ else if(class(x) == "character") # this is for one string containing multiple trees
+ {
+ # read with ape and dump back to a vector of strings
+ k <- read.tree(text=x)
+
+ # call this to process it properly
+ dist.multiPhylo(x, method=method, force.multi2di=force.multi2di, outgroup=outgroup,
+ convert.multifurcating=convert.multifurcating,
+ use.random.resolution=use.random.resolution,
+ scale=scale, verbose=verbose)
+ }
+ else
+ {
+ stop("Cannot coerce the argument into a usable type.")
+ }
+}
+
diff --git a/R/gromov.hyperbolicity.R b/R/gromov.hyperbolicity.R
new file mode 100644
index 0000000..3e55659
--- /dev/null
+++ b/R/gromov.hyperbolicity.R
@@ -0,0 +1,30 @@
+gromov.hyperbolicity <- function(d, deltas = FALSE, scale = NA)
+{
+ d <- as.matrix(as.dist(d))
+ a = dim(d);
+ if(a[1] != a[2])
+ {
+ stop("The parameter could not be coerced into a square distance matrix.")
+ }
+
+ if(a[1] < 4)
+ {
+ stop("At least 4 points must be used to compute the Gromov hyperbolicity constant.")
+ }
+
+ if(is.na(scale))
+ {
+ scale = "none"
+ }
+
+ scaleV = pmatch(scale, c("none", "max", "perimeter"))
+
+ if(length(scaleV) > 1 || is.na(scaleV))
+ {
+ stop("Unknown or ambiguous scale method.");
+ }
+
+ .Call("gromov_distmatrix", as.double(d), as.logical(deltas),
+ as.integer(scaleV), PACKAGE="distory")
+}
+
diff --git a/R/mcmc.target.seq.R b/R/mcmc.target.seq.R
new file mode 100644
index 0000000..4a6295d
--- /dev/null
+++ b/R/mcmc.target.seq.R
@@ -0,0 +1,135 @@
+mcmc.target.seq <- function(data, x, F, n)
+{
+ N = ncol(data)
+ D = rep(1, N)
+
+ likvals = c()
+
+ a = runif(n)
+
+ old.tree = F(lookup.samples(data, list(convert.table.to.idx(D)))[[1]])
+ old.dist = as.matrix(dist.multiPhylo(list(x, old.tree)))[1,2]
+
+ best.tree = old.tree
+ best.dist = old.dist
+
+ for(i in 1:n)
+ {
+ Dp = D
+ good = FALSE
+
+ r = sample.int(N, 2)
+ while(D[r[1]] == N || D[r[2]] == 0)
+ {
+ r = sample.int(N, 2)
+ }
+
+ # propose the swap
+ Dp[r[1]] = Dp[r[1]] + 1;
+ Dp[r[2]] = Dp[r[2]] - 1;
+
+ new.tree = F(lookup.samples(data, list(convert.table.to.idx(Dp)))[[1]])
+ new.dist = as.matrix(dist.multiPhylo(list(x, new.tree)))[1,2]
+
+ v = old.dist/new.dist
+
+ if(v >= 1)
+ {
+ v = 1
+ best.tree = new.tree
+ best.dist = new.dist
+ }
+
+ if(v^(100-exp(1/i)) > a[i])
+ {
+ D = Dp
+ old.tree = new.tree
+ old.dist = new.dist
+ }
+
+ likvals = c(likvals, old.dist)
+ }
+
+ list(data = D, tree = best.tree, distance = best.dist, vals = likvals)
+}
+
+# modified from ape code for boot.phylo
+boot.samples.idxs <- function(data, B = 100, block = 1)
+{
+ if (is.list(data) && !is.data.frame(data)) {
+ if (inherits(data, "DNAbin"))
+ data <- as.matrix(data)
+ else {
+ nm <- names(data)
+ n <- length(data)
+ data <- unlist(data)
+ nL <- length(data)
+ data <- matrix(data, n, nL/n, byrow = TRUE)
+ rownames(data) <- nm
+ }
+ }
+ boot.smpls <- vector("list", B)
+ for (i in 1:B) {
+ if (block > 1) {
+ y <- seq(block, ncol(data), block)
+ boot.i <- sample(y, replace = TRUE)
+ boot.samp <- numeric(ncol(data))
+ boot.samp[y] <- boot.i
+ for (j in 1:(block - 1)) boot.samp[y - j] <- boot.i -
+ j
+ }
+ else boot.samp <- sample(ncol(data), replace = TRUE)
+
+ boot.smpls[[i]] <- boot.samp
+ }
+
+ boot.smpls
+}
+
+lookup.samples <- function(data, idxs)
+{
+ if (is.list(data) && !is.data.frame(data)) {
+ if (inherits(data, "DNAbin"))
+ data <- as.matrix(data)
+ else {
+ nm <- names(data)
+ n <- length(data)
+ data <- unlist(data)
+ nL <- length(data)
+ data <- matrix(data, n, nL/n, byrow = TRUE)
+ rownames(data) <- nm
+ }
+ }
+
+ lapply(idxs, function(i) { data[,i] } )
+}
+
+phylo.diff <- function(x, y, ...)
+{
+ uniqT1 <- distinct.edges(x, y)
+ uniqT2 <- distinct.edges(y, x)
+
+ treeA.cs <- rep("black", dim(x$edge)[1])
+ treeA.cs[uniqT1] = "red"
+
+ treeB.cs <- rep("black", dim(y$edge)[1])
+ treeB.cs[uniqT2] = "red"
+
+ par(mfrow=c(1,2))
+ plot(x, edge.color=treeA.cs, ...)
+ plot(y, edge.color=treeB.cs, ...)
+
+ invisible()
+}
+
+convert.table.to.idx <- function(T)
+{
+ dat = c()
+ for(i in 1:length(T))
+ {
+ dat = c(dat, rep(i, T[i]))
+ }
+
+ dat
+}
+
diff --git a/R/orthant.boundary.tree.R b/R/orthant.boundary.tree.R
new file mode 100644
index 0000000..bcc7f94
--- /dev/null
+++ b/R/orthant.boundary.tree.R
@@ -0,0 +1,32 @@
+orthant.boundary.tree <- function(x, y)
+{
+ e1 <- distinct.edges(x,y)
+ e2 <- distinct.edges(y,x)
+
+ if(length(e1) != 1)
+ stop("Trees must differ by only one edge")
+
+ length.e1 = x$edge.length[e1]
+ length.e2 = y$edge.length[e2]
+
+ lambda = length.e1 / ( length.e1 + length.e2)
+
+ bdy.tree <- x
+
+ partitions <- partition.leaves(x)
+
+ bdy.tree$edge.length <- lapply(partitions,
+ function(e)
+ {
+ a.e <- x$edge.length[edge.from.split(x, e)]
+ b.e <- y$edge.length[edge.from.split(y, e)]
+
+ a.e*(1 - lambda) + b.e * lambda
+ }
+ )
+
+ bdy.tree$edge.length[e1] = 0;
+
+ bdy.tree
+}
+
diff --git a/R/phylo.diff.R b/R/phylo.diff.R
new file mode 100644
index 0000000..fdb61b3
--- /dev/null
+++ b/R/phylo.diff.R
@@ -0,0 +1,79 @@
+phylo.diff <- function(x, y, ...)
+{
+ uniqT1 <- distinct.edges(x, y)
+ uniqT2 <- distinct.edges(y, x)
+
+ treeA.cs <- rep("black", dim(x$edge)[1])
+ treeA.cs[uniqT1] = "red"
+
+ treeB.cs <- rep("black", dim(y$edge)[1])
+ treeB.cs[uniqT2] = "red"
+
+ par(mfrow=c(1,2))
+ plot(x, edge.color=treeA.cs, ...)
+ plot(y, edge.color=treeB.cs, ...)
+
+ invisible()
+}
+
+distinct.edges <- function(x, y) # all edges in x not in y
+{
+ bp1 <- partition.leaves(x)
+ bp1 <- lapply(bp1, sort)
+ bp2 <- partition.leaves(y)
+ bp2 <- lapply(bp2, sort)
+
+ p = c()
+
+ for(i in 1:length(bp1))
+ {
+ if(!(list(bp1[[i]]) %in% bp2))
+ {
+ p <- append(p, i)
+ }
+ }
+
+ p
+}
+
+edge.from.split <- function(x, split)
+{
+ splits <- partition.leaves(x)
+ splits <- lapply(splits, sort)
+ split <- sort(split)
+ match(list(split), splits)
+}
+
+get.bipartition <- function(x, e) # of leaves from edge
+{
+ acc = vector()
+ inc = FALSE;
+ for(i in 1:dim(x$edge)[1])
+ {
+ if(x$edge[i,1] == e)
+ {
+ acc <- append(acc, get.bipartition(x, x$edge[i,2]));
+ inc = TRUE;
+ }
+ }
+
+ if(!inc)
+ {
+ acc = x$tip.label[e];
+ }
+
+ acc
+}
+
+partition.leaves <- function(x) # get all bipartitoins
+{
+ r = list()
+ for(i in 1:dim(x$edge)[1])
+ {
+ t <- get.bipartition(x, x$edge[i,2]);
+ r <- append(r, list(t));
+ }
+
+ r
+}
+
diff --git a/R/position.leverage.R b/R/position.leverage.R
new file mode 100644
index 0000000..423e127
--- /dev/null
+++ b/R/position.leverage.R
@@ -0,0 +1,33 @@
+position.leverage <- function(data, F, to = NULL, rep = 50, by = 1)
+{
+ N = ncol(data)
+ D = rep(1, N)
+
+ leverage.vals = c()
+
+ if(is.null(to))
+ {
+ to = F(lookup.samples(data, list(convert.table.to.idx(D)))[[1]])
+ }
+
+ for(i in seq(1,N,by))
+ {
+ Dp = D;
+ S = sample(N, N/rep);
+ while(i %in% S)
+ {
+ S = sample(N,N/rep);
+ }
+
+ Dp[S] = 0;
+ Dp[i] = N/rep;
+
+ new.tree = F(lookup.samples(data, list(convert.table.to.idx(Dp)))[[1]])
+ leverage.vals = c(leverage.vals, as.matrix(dist.multiPhylo(list(to,
+ new.tree)))[1,2])
+ }
+
+ leverage.vals
+}
+
+
diff --git a/R/zzz.R b/R/zzz.R
new file mode 100644
index 0000000..9391b00
--- /dev/null
+++ b/R/zzz.R
@@ -0,0 +1,5 @@
+.onUnload <- function(libpath)
+{
+ library.dynam.unload("distory", libpath)
+}
+
diff --git a/debian/changelog b/debian/changelog
deleted file mode 100644
index 419a463..0000000
--- a/debian/changelog
+++ /dev/null
@@ -1,5 +0,0 @@
-r-cran-distory (1.4.2-1) unstable; urgency=low
-
- * Initial release (closes: #828857)
-
- -- Andreas Tille <tille at debian.org> Tue, 28 Jun 2016 17:08:15 +0200
diff --git a/debian/compat b/debian/compat
deleted file mode 100644
index ec63514..0000000
--- a/debian/compat
+++ /dev/null
@@ -1 +0,0 @@
-9
diff --git a/debian/control b/debian/control
deleted file mode 100644
index 212a975..0000000
--- a/debian/control
+++ /dev/null
@@ -1,23 +0,0 @@
-Source: r-cran-distory
-Maintainer: Debian Med Packaging Team <debian-med-packaging at lists.alioth.debian.org>
-Uploaders: Andreas Tille <tille at debian.org>
-Section: gnu-r
-Priority: optional
-Build-Depends: debhelper (>= 9),
- cdbs,
- r-base-dev,
- r-cran-ape
-Standards-Version: 3.9.8
-Vcs-Browser: https://anonscm.debian.org/viewvc/debian-med/trunk/packages/R/r-cran-distory/trunk/
-Vcs-Svn: svn://anonscm.debian.org/debian-med/trunk/packages/R/r-cran-distory/trunk/
-Homepage: https://cran.r-project.org/web/packages/distory
-
-Package: r-cran-distory
-Architecture: any
-Depends: ${shlibs:Depends},
- ${misc:Depends},
- ${R:Depends},
- r-cran-ape
-Description: GNU R distance between phylogenetic histories
- This GNU R package enables calculation of geodesic distance between
- phylogenetic trees and associated functions.
diff --git a/debian/copyright b/debian/copyright
deleted file mode 100644
index 06636df..0000000
--- a/debian/copyright
+++ /dev/null
@@ -1,39 +0,0 @@
-Format: http://www.debian.org/doc/packaging-manuals/copyright-format/1.0/
-Upstream-Name: distory
-Upstream-Contact: Susan Holmes <susan at stat.stanford.edu>
-Source: https://cran.r-project.org/web/packages/distory/index.html
-
-Files: *
-Copyright: 2012-2016 John Chakerian and Susan Holmes
-License: BSD-3-clause
-
-Files: debian/*
-Copyright: 2016 Andreas Tille <tille at debian.org>
-License: BSD-3-clause
-
-License: BSD-3-clause
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions are
- met:
- .
- 1. Redistributions of source code must retain the above copyright
- notice, this list of conditions and the following disclaimer.
- 2. Redistributions in binary form must reproduce the above copyright
- notice, this list of conditions and the following disclaimer in
- the documentation and/or other materials provided with the
- distribution.
- 3. Neither the name of the copyright holder nor the names of its
- contributors may be used to endorse or promote products derived
- from this software without specific prior written permission.
- .
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
- IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
- TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
- PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
- TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
- PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
- LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
- NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/debian/rules b/debian/rules
deleted file mode 100755
index 2fbba2d..0000000
--- a/debian/rules
+++ /dev/null
@@ -1,3 +0,0 @@
-#!/usr/bin/make -f
-
-include /usr/share/R/debian/r-cran.mk
diff --git a/debian/source/format b/debian/source/format
deleted file mode 100644
index 163aaf8..0000000
--- a/debian/source/format
+++ /dev/null
@@ -1 +0,0 @@
-3.0 (quilt)
diff --git a/debian/watch b/debian/watch
deleted file mode 100644
index c2be9eb..0000000
--- a/debian/watch
+++ /dev/null
@@ -1,2 +0,0 @@
-version=3
-http://cran.r-project.org/src/contrib/distory_([-\d.]*)\.tar\.gz
diff --git a/man/bethe.tree.Rd b/man/bethe.tree.Rd
new file mode 100644
index 0000000..0c17261
--- /dev/null
+++ b/man/bethe.tree.Rd
@@ -0,0 +1,56 @@
+\name{bethe.tree}
+\Rdversion{1.1}
+\alias{bethe.tree}
+\title{
+ Bethe Tree
+}
+
+\description{
+ Generates a Bethe tree with given tips, inner edge lengths, and outgroup.
+}
+
+\usage{
+ bethe.tree(tips, level.lengths = NULL, outgroup="O", outgroup.dist=1)
+}
+
+\arguments{
+ \item{tips}{
+ A list of tip names as a character vector. Should be a power of 2. All
+ tip names must be distinct.
+}
+ \item{level.lengths}{
+ Edge lengths for each level, counted from the bottom up. NULL means a
+ default of 1. If the vector isn't long enough, the last value will be
+ repeated as necessary.
+}
+ \item{outgroup}{
+ The tip label for the outgroup.
+}
+ \item{outgroup.dist}{
+ The distance of the outgroup from the root.
+}
+}
+
+\details{
+ Generates a Bethe tree with specified internal edge lengths.
+}
+\value{
+ A class of type \code{phylo} representing the tree.
+}
+\author{
+ John Chakerian <chakj at stanford.edu>
+}
+\references{
+ Chakerian, J. and Holmes, S. P. Computational Tools for Evaluating
+ Phylogenetic and Heirarchical Clustering Trees. arXiv:1006.1015v1.
+}
+
+\seealso{
+ \code{\link[distory]{dist.multiPhylo}}
+}
+
+\examples{
+
+plot(bethe.tree( as.character(1:16), 1:4, "17", 14))
+
+}
diff --git a/man/bin.multiPhylo.Rd b/man/bin.multiPhylo.Rd
new file mode 100644
index 0000000..d0926b1
--- /dev/null
+++ b/man/bin.multiPhylo.Rd
@@ -0,0 +1,60 @@
+\name{bin.multiPhylo}
+\Rdversion{1.1}
+\alias{bin.multiPhylo}
+\title{
+ Bin Trees
+}
+
+\description{
+ Bins trees according to branching topology.
+}
+
+\usage{
+ bin.multiPhylo(treelist)
+}
+
+\arguments{
+ \item{treelist}{
+ A list of trees that can be passed to dist.phylo (see the help for
+ dist.phylo for acceptable formats)
+}
+}
+
+\details{
+ Bins trees according to branching topology. Two trees are considered to have
+ the same topology if the same set of partitions of tips are produced by
+ the edges, which corresponds to the same branching up to rearrangement
+ of tips.
+}
+\value{
+ Returns a numeric vector of bin ids. Bin ids are assigned in order of the
+ first tree in that bin, that is, the first k unique trees in the list
+ passed will be assigned bins 1..k in order of appearance.
+}
+\author{
+ John Chakerian <chakj at stanford.edu>
+}
+\references{
+ Chakerian, J. and Holmes, S. P. Computational Tools for Evaluating
+ Phylogenetic and Heirarchical Clustering Trees. arXiv:1006.1015v1.
+
+}
+
+\seealso{
+ \code{\link[distory]{dist.multiPhylo}}
+}
+
+\examples{
+
+data(woodmouse)
+otree <- root(fastme.ols(dist.dna(woodmouse)), "No305", resolve.root=TRUE)
+breps = 500
+
+trees <- boot.phylo(otree, woodmouse, B=breps, function(x)
+ root(fastme.ols(dist.dna(x)), "No305", resolve.root=TRUE),trees=TRUE)
+
+combined.trees <- c(list(otree), trees$trees)
+
+bin.multiPhylo(combined.trees)
+
+}
diff --git a/man/dist.multiPhylo.Rd b/man/dist.multiPhylo.Rd
new file mode 100644
index 0000000..662360b
--- /dev/null
+++ b/man/dist.multiPhylo.Rd
@@ -0,0 +1,122 @@
+\name{dist.multiPhylo}
+\Rdversion{1.1}
+\alias{dist.multiPhylo}
+\title{
+ Geodesic Distance Between Phylogenetic Trees
+}
+
+\description{
+ Computes the geodesic distance of a list of phylogenetic trees using a
+ polynomial algorithm.
+}
+
+\usage{
+ dist.multiPhylo(x, method="geodesic", force.multi2di = FALSE, outgroup = NULL,
+ convert.multifurcating = FALSE, use.random.resolution = FALSE, scale =
+ NULL, verbose = FALSE)
+}
+
+\arguments{
+ \item{x}{
+ A list of ape trees (class 'phylo'). The list does not have to be of
+ class 'multiPhylo'. The function will also accept a list of strings of
+ trees in Newick format, or a single string with trees in Newick format
+ separated by semicolons. All the trees must have the same tip labels.
+}
+ \item{method}{
+ Determines which distance method is used. Options are 'geodesic' for the
+ tree space geodesic distance, or 'edgeset' for the number of edges
+ (defined by splits of tips) that are different.
+}
+ \item{force.multi2di}{
+ Force conversion of every tree to strict bifurcating through the ape
+ function 'multi2di', using the use.random.resolution as its parameter.
+ This option should not be used in conjunction with specification of an
+ outgroup.
+}
+ \item{outgroup}{
+ Specifies an outgroup to root each tree with respect to. This calls the
+ ape function 'root' on every tree in the list.
+}
+ \item{convert.multifurcating}{
+ Setting this option will check every tree for multifurcations using the
+ ape function 'is.binary.tree' - if it returns FALSE, the ape function
+ 'multi2di' will be called on it. Note that this does not ensure a tree
+ is strictly binary, since ape considers an unrooted tree binary even if
+ the root node is trifurcating. This option can be used in conjunction
+ with specification of an outgroup.
+}
+
+ \item{use.random.resolution}{
+ Specifies the parameter to 'multi2di' if needed.
+}
+ \item{scale}{
+ Specifies a scale to make all trees unformly scaled (that is, the sum of
+ all edges will be uniform)scale to make all trees unformly scaled (that
+ is, the sum of all edge lengths will be uniform). The parameter can
+ either be a tree of class \code{phylo} or a numeric value for the sum of
+ all edge lengths.
+}
+ \item{verbose}{
+ Turns on incremental status updates and more warnings. Helpful for large
+ computations.
+}
+
+}
+\details{
+ This function computes the geodesic distance according to Billera et. al.
+ using an algorithm based off of the polynomial time algorithm of Owen
+ and Provan. Since it corresponds to a formal definition of tree-space as
+ a space of strictly binary trees, no mulifurcations are allowed, including
+ on the root node. In addition, negative and 0-lengthed edges are clamped to
+ a very small value (DBL_MIN) for technical reasons.
+
+ The Newick parser supports only a subset of the Newick format. In
+ particular, it does not at the moment allow for internal node labels, only
+ weights. Weights will be automatically set to 1 if not specified. It may be
+ necessary to clean data in ape to make the trees conform to this.
+}
+\value{
+ Returns a distance matrix of class 'dist' representing the pairwise
+ geodesic distances between all input trees. Keep in mind this distance
+ matrix is not Euclidean. N/A values are provided in the case of an error in
+ determining the distance.
+}
+\references{
+ Chakerian, J. and Holmes, S. P. Computational Tools for Evaluating
+ Phylogenetic and Heirarchical Clustering Trees. arXiv:1006.1015v1.
+
+ Billera, L. J., Holmes, S. P., and Vogtmann, K. (2001) Geometry of the space
+ of phylogenetic trees. _Adv. Appl. Math_, *27*, 733-767.
+
+ Megan Owen, J. Scott Provan, "A Fast Algorithm for Computing Geodesic
+ Distances in Tree Space," IEEE/ACM Transactions on Computational Biology
+ and Bioinformatics, 14 Jan. 2010.
+}
+\author{
+ John Chakerian <chakj at stanford.edu>
+}
+
+\seealso{
+ \code{\link[ape]{dist.dna}}
+ \code{\link[ape]{boot.phylo}}
+ \code{\link{cmdscale}}
+}
+
+\examples{
+
+data(woodmouse)
+otree <- root(nj(dist.dna(woodmouse)), "No305", resolve.root=TRUE)
+breps = 250
+
+trees <- boot.phylo(otree, woodmouse, B=breps, function(x)
+ root(nj(dist.dna(x)), "No305", resolve.root=TRUE),trees=TRUE)
+
+combined.trees <- c(list(otree), trees$trees)
+tree.dists <- dist.multiPhylo(combined.trees)
+
+mdres <- cmdscale(tree.dists, k=breps, add=TRUE)
+plot(mdres$points[,1], mdres$points[,2], col = c("red", rep("black", breps)))
+text(mdres$points[,1], mdres$points[,2], ,labels=1:(breps+1), cex=0.7, adj=c(0,2))
+
+}
diff --git a/man/distory-package.Rd b/man/distory-package.Rd
new file mode 100644
index 0000000..3c4f234
--- /dev/null
+++ b/man/distory-package.Rd
@@ -0,0 +1,51 @@
+\name{distory-package}
+\Rdversion{1.1}
+\alias{distory-package}
+\alias{distory}
+\docType{package}
+\title{
+ Distance Between Phylogenetic Histories
+}
+\description{
+ The 'distory' package provides functions for computing geodesic distances
+ between phylogenetic trees, as well as functions which use this distance.
+ Methods for computing Gromov delta-hyperbolicity, Markov Chain Monte Carlo
+ routines in tree space, and per-position leverage for DNA sequences are
+ included.
+}
+\details{
+ A description of the algorithm used for the distance computation can be found
+ in help(dist.multiPhylo).
+}
+\author{
+ John Chakerian <chakj at stanford.edu> and Susan Holmes <susan at stat.stanford.edu>
+}
+\references{
+ Chakerian, J. and Holmes, S. P. Computational Tools for Evaluating
+ Phylogenetic and Heirarchical Clustering Trees. arXiv:1006.1015v1.
+
+ Billera, L. J., Holmes, S. P., and Vogtmann, K. (2001) Geometry of the space
+ of phylogenetic trees. _Adv. Appl. Math_, *27*, 733-767.
+
+ Megan Owen, J. Scott Provan, "A Fast Algorithm for Computing Geodesic
+ Distances in Tree Space," IEEE/ACM Transactions on Computational Biology
+ and Bioinformatics, 14 Jan. 2010.
+}
+\keyword{ package }
+\examples{
+
+data(woodmouse)
+otree <- root(nj(dist.dna(woodmouse)), "No305", resolve.root=TRUE)
+breps = 250
+
+trees <- boot.phylo(otree, woodmouse, B=breps, function(x)
+ root(nj(dist.dna(x)), "No305", resolve.root=TRUE),trees=TRUE)
+
+combined.trees <- c(list(otree), trees$trees)
+tree.dists <- dist.multiPhylo(combined.trees)
+
+mdres <- cmdscale(tree.dists, k=breps, add=TRUE)
+plot(mdres$points[,1], mdres$points[,2], col = c("red", rep("black", breps)))
+text(mdres$points[,1], mdres$points[,2], labels=1:(breps+1), cex=0.7, adj=c(0,2))
+
+}
diff --git a/man/gromov.hyperbolicity.Rd b/man/gromov.hyperbolicity.Rd
new file mode 100644
index 0000000..67600f6
--- /dev/null
+++ b/man/gromov.hyperbolicity.Rd
@@ -0,0 +1,75 @@
+\name{gromov.hyperbolicity}
+\Rdversion{1.1}
+\alias{gromov.hyperbolicity}
+\title{
+ Gromov Hyperbolicity Constant
+}
+
+\description{
+ Computes the Gromov Hyperbolicity Constant of a distance matrix.
+}
+
+\usage{
+ gromov.hyperbolicity(d, deltas = FALSE, scale = NA)
+}
+
+\arguments{
+ \item{d}{
+ A distance matrix of type \code{dist} or \code{matrix}, or anything that
+ can be coerced into \code{dist} by \code{as.dist}. Must have at least 4
+ points.
+}
+ \item{deltas}{
+ A logical value specifying whether to return the vector of delta values.
+ Default is \code{FALSE}.
+}
+ \item{scale}{
+ Specifies a scaling method for each delta. Default is no scaling (NA or
+ "none"). Available methods are "max" which scales deltas by the max of the
+ sums computed, and "perimeter" for the largest perimeter of the four points.
+}
+}
+
+\details{
+ This computes a constant that represents the relaxation of a 4-point
+ condition for delta-hyperbolicity. See (Gromov 1987) for details.
+}
+
+\value{
+ The Gromov hyperbolicity constant of the given distance matrix.
+}
+
+\author{
+ John Chakerian <chakj at stanford.edu>
+}
+
+\seealso{
+ \code{\link[distory]{dist.multiPhylo}}
+}
+
+\references{
+ M. Gromov. Hyperbolic groups. In Essays in Group Theory, pages 73-263.
+ Springer, New York, 1987.
+
+ Chakerian, J. and Holmes, S. P. Computational Tools for Evaluating
+ Phylogenetic and Heirarchical Clustering Trees. arXiv:1006.1015v1.
+}
+
+\examples{
+
+# scale final delta by max distance
+points <- cbind(runif(100), runif(100))
+d <- dist(points)
+gromov.hyperbolicity(d)/max(d)
+
+# scale each delta by max distance for the 4 points
+points <- cbind(runif(100), runif(100))
+d <- dist(points)
+gromov.hyperbolicity(d, scale="max")
+
+# scale each delta by the max perimeter for the 4 points
+points <- cbind(runif(100), runif(100))
+d <- dist(points)
+gromov.hyperbolicity(d, scale="max")
+
+}
diff --git a/man/mcmc.target.seq.Rd b/man/mcmc.target.seq.Rd
new file mode 100644
index 0000000..939540f
--- /dev/null
+++ b/man/mcmc.target.seq.Rd
@@ -0,0 +1,136 @@
+\name{mcmc.target.seq}
+\Rdversion{1.1}
+\alias{mcmc.target.seq}
+\alias{boot.samples.idxs}
+\alias{lookup.samples}
+\alias{convert.table.to.idx}
+
+\title{
+ Find MCMC Target Sequence
+}
+
+\description{
+ \code{mcmc.target.seq} uses MCMC to find a configuration of DNA positions
+ to get as close as possible to a given tree.
+
+ \code{boot.samples.idxs} bootstraps over indices into a DNA matrix.
+
+ \code{lookup.samples} goes from an index representation of a configuration
+ of DNA to the actual DNAbin format.
+
+ \code{convert.table.to.idx} converts a table of counts for positions 1..n
+ into a list of indices corresponding to positions (i.e. goes from the
+ tabled form to a vector whose tabling matches the input).
+}
+
+\usage{
+ mcmc.target.seq(data, x, F, n)
+
+ boot.samples.idxs(data, B = 100, block = 1)
+
+ lookup.samples(data, idxs)
+
+ convert.table.to.idx(T)
+}
+
+\arguments{
+ \item{data}{
+ A DNA matrix in DNAbin format.
+}
+ \item{x}{
+ A tree of class 'phylo' to estimate.
+}
+ \item{F}{
+ A tree estimation function, accepting a DNA matrix in DNAbin format and
+ returning a tree of class 'phylo.'
+}
+ \item{n}{
+ The number of MCMC iterations to perform
+}
+ \item{B}{
+ The number of bootstrap replicates.
+}
+ \item{block}{
+ The block size to use during bootstrapping.
+}
+ \item{idxs}{
+ A list of numeric vectors of indices to use for lookup.
+}
+ \item{T}{
+ A table or table-like vector to convert.
+}
+}
+
+\details{
+ \code{mcmc.target.seq} performs an MCMC with simulated annealing to locate
+ a configuration of DNA positions from the original matrix that gets as
+ close as possible to a target tree. Propositions for the MCMC replacing
+ one character with another uniformly at random.
+
+ The remaining functions are intended to be used as support functions.
+}
+\value{
+ \code{mcmc.target.seq} returns a list of 4 elements: a numeric vector of
+ counts of each position in the original matrix, the best estimated
+ tree, the best distance from the estimated tree to the target tree, and
+ a numeric vector of the distances for every iteration of the
+ simulation.
+
+ \code{boot.samples.idxs} returns a numeric vector representing the
+ bootstrapped idices.
+
+ \code{lookup.samples} returns a list of objects of class DNAbin
+ corresponding to the DNA sequences generated from indices into the original
+ DNA matrix.
+
+ \code{convert.table.to.idx} returns a numeric vector of indices based on
+ the table counts.
+}
+\author{
+ John Chakerian <chakj at stanford.edu>
+}
+\references{
+ Chakerian, J. and Holmes, S. P. Computational Tools for Evaluating
+ Phylogenetic and Heirarchical Clustering Trees. arXiv:1006.1015v1.
+}
+
+\seealso{
+ \code{\link[distory]{dist.multiPhylo}}
+ \code{\link[distory]{orthant.boundary.tree}}
+}
+
+\examples{
+
+data(woodmouse)
+otree <- root(fastme.ols(dist.dna(woodmouse)), "No305", resolve.root=TRUE)
+breps = 200
+
+trees <- boot.phylo(otree, woodmouse, B=breps, function(x)
+ root(fastme.ols(dist.dna(x)), "No305", resolve.root=TRUE),trees=TRUE)
+
+combined.trees <- c(list(otree), trees$trees)
+
+binning <- bin.multiPhylo(combined.trees)
+
+tree.a <- combined.trees[[match(1, binning)]]
+i = 2
+max.bin = max(binning)
+tree.b <- combined.trees[[match(2, binning)]]
+
+while(length(distinct.edges(tree.a,tree.b)) > 1 && i < max.bin)
+{
+ i = i + 1
+ tree.b = combined.trees[[match(i, binning)]]
+}
+
+bdy.tree <- orthant.boundary.tree(tree.a, tree.b)
+
+f.est <- function(x) root(nj(dist.dna(x)), "No305", resolve.root=TRUE)
+
+res <- mcmc.target.seq(woodmouse, bdy.tree, f.est, 1000)
+
+par(mfrow=c(2,1))
+plot(res$tree)
+plot(res$vals)
+
+}
diff --git a/man/orthant.boundary.tree.Rd b/man/orthant.boundary.tree.Rd
new file mode 100644
index 0000000..646b2b7
--- /dev/null
+++ b/man/orthant.boundary.tree.Rd
@@ -0,0 +1,73 @@
+\name{orthant.boundary.tree}
+\Rdversion{1.1}
+\alias{orthant.boundary.tree}
+\title{
+ Orthant Boundary Tree
+}
+
+\description{
+ Produces a degenerate tree on the boundary between trees that differ by one split.
+}
+
+\usage{
+ orthant.boundary.tree(x,y)
+}
+
+\arguments{
+ \item{x}{
+ The tree in the first orthant.
+}
+ \item{y}{
+ The tree in the second orthant.
+}
+}
+
+\details{
+ The tree found is the tree on the boundary between the two orthants such
+ that it is on the straight line connecting the two trees when one
+ orthant is thought of as being the (-,+) quadrant and the second
+ orthant as being the (+,+) quadrant, where the (0,y) line is the particular
+ boundary in question.
+}
+\value{
+ Returns an object of class 'phylo' representing the boundary tree.
+}
+\author{
+ John Chakerian <chakj at stanford.edu>
+}
+\references{
+ Chakerian, J. and Holmes, S. P. Computational Tools for Evaluating
+ Phylogenetic and Heirarchical Clustering Trees. arXiv:1006.1015v1.
+}
+
+\seealso{
+ \code{\link[distory]{mcmc.target.seq}}
+}
+
+\examples{
+
+data(woodmouse)
+otree <- root(fastme.ols(dist.dna(woodmouse)), "No305", resolve.root=TRUE)
+breps = 200
+
+trees <- boot.phylo(otree, woodmouse, B=breps, function(x)
+ root(fastme.ols(dist.dna(x)), "No305", resolve.root=TRUE),trees=TRUE)
+
+combined.trees <- c(list(otree), trees$trees)
+
+binning <- bin.multiPhylo(combined.trees)
+
+tree.a <- combined.trees[[match(1, binning)]]
+i = 2
+max.bin = max(binning)
+tree.b <- combined.trees[[match(2, binning)]]
+
+while(length(distinct.edges(tree.a,tree.b)) > 1 && i < max.bin)
+{
+ i = i + 1
+ tree.b = combined.trees[[match(i, binning)]]
+}
+
+plot(orthant.boundary.tree(tree.a, tree.b))
+
+}
diff --git a/man/phylo.diff.Rd b/man/phylo.diff.Rd
new file mode 100644
index 0000000..ea07c69
--- /dev/null
+++ b/man/phylo.diff.Rd
@@ -0,0 +1,107 @@
+\name{phylo.diff}
+\Rdversion{1.1}
+\alias{phylo.diff}
+\alias{distinct.edges}
+\alias{edge.from.split}
+\alias{get.bipartition}
+\alias{partition.leaves}
+
+\title{
+ Differences Between Phylogenetic Trees
+}
+
+\description{
+ A family of functions for determining and plotting the differences between
+ two trees.
+
+ \code{phylo.diff} plots two trees side by side, highlighting edges unique
+ to each tree in red.
+
+ \code{distinct.edges} finds the edges present in the first argument not in
+ the second.
+
+ \code{edge.from.split} locates the edge id from a given split.
+
+ \code{get.bipartition} gets the bipartition of tips formed by a single edge.
+
+ \code{partition.leaves} returns the set of all bipartitions from all edges.
+}
+
+\usage{
+ phylo.diff(x, y, ...)
+
+ distinct.edges(x, y)
+
+ edge.from.split(x, split)
+
+ get.bipartition(x, e)
+
+ partition.leaves(x)
+
+}
+
+\arguments{
+ \item{x}{
+ The first (or only) tree.
+}
+ \item{y}{
+ The second tree, for the functions that accept two trees.
+}
+ \item{split}{
+ A list of bipartitions, probably from \code{partition.leaves}.
+}
+ \item{e}{
+ An edge for a particular tree, given as an id.
+}
+ \item{...}{
+ Additional arguments to pass to the \code{plot.phylo} function.
+}
+}
+
+\details{
+ \code{phylo.diff} uses the ape tree plotting function. The other functions
+ are mostly meant as support functions.
+}
+\value{
+ \code{phylo.diff} returns invisible.
+
+ \code{distinct.edges} returns a numeric vector of edge ids for the first
+ tree.
+
+ \code{edge.from.split} returns an edge id for a particular tree
+ corresponding to a given bipartition and NA if none such edge exists.
+
+ \code{get.bipartition} returns a character vector of the tips below that
+ edge in the given tree.
+
+ \code{partition.leaves} returns a list of partitions (themselves character
+ vectors) of the given tree.
+}
+\author{
+ John Chakerian <chakj at stanford.edu>
+}
+\references{
+ Chakerian, J. and Holmes, S. P. Computational Tools for Evaluating
+ Phylogenetic and Heirarchical Clustering Trees. arXiv:1006.1015v1.
+}
+
+\seealso{
+ \code{\link[distory]{dist.multiPhylo}}
+}
+
+\examples{
+
+data(woodmouse)
+otree <- root(fastme.ols(dist.dna(woodmouse)), "No305", resolve.root=TRUE)
+breps = 10
+
+trees <- boot.phylo(otree, woodmouse, B=breps, function(x)
+ root(fastme.ols(dist.dna(x)), "No305", resolve.root=TRUE),trees=TRUE)
+
+combined.trees <- c(list(otree), trees$trees)
+
+binning <- bin.multiPhylo(combined.trees)
+
+phylo.diff(combined.trees[[match(1, binning)]], combined.trees[[match(2, binning)]])
+
+}
diff --git a/man/position.leverage.Rd b/man/position.leverage.Rd
new file mode 100644
index 0000000..abe7233
--- /dev/null
+++ b/man/position.leverage.Rd
@@ -0,0 +1,71 @@
+\name{position.leverage}
+\Rdversion{1.1}
+\alias{position.leverage}
+\title{
+ Position Leverage
+}
+
+\description{
+ Provides a rough heuristic for determining the degree to which each
+ position in the DNA matrix affects the resulting tree.
+}
+
+\usage{
+ position.leverage(data, F, to = NULL, rep = 50, by = 1)
+}
+
+\arguments{
+ \item{data}{
+ A DNA matrix in \code{DNAbin} format.
+}
+ \item{F}{
+ A tree estimation function, accepting a DNA matrix of class \code{DNAbin}
+ and returning a tree of class \code{phylo}
+}
+ \item{to}{
+ The tree with which distances are measured in respect to, or \code{NULL}
+ to indicate the tree estimated by \code{F} for the starting DNA matrix.
+}
+ \item{rep}{
+ The number of times to replicate the position in question.
+}
+ \item{by}{
+ The function will perform the calculation on every \code{by}-th position
+ (that is, on \code{ seq(1,N,by) } )
+}
+}
+
+\details{
+ This function takes a DNA matrix and, for every \code{by}-th position,
+ replicates that position \code{rep} times, randomly removing
+ \code{rep} other positions to keep all sequences the same length
+ other positions to keep all sequences the same length. For each new DNA
+ matrix created in this way, \code{F} is used to estimate the corresponding
+ tree, and the distance to tree \code{to} is computed and stored. This
+ distance can be thought of as somewhat analogous to the leverage of that
+ position.
+}
+\value{
+ Returns a numeric vector of distances from tree \code{to} for each position
+ sampled.
+}
+\author{
+ John Chakerian <chakj at stanford.edu>
+}
+\references{
+ Chakerian, J. and Holmes, S. P. Computational Tools for Evaluating
+ Phylogenetic and Heirarchical Clustering Trees. arXiv:1006.1015v1.
+}
+
+\seealso{
+ \code{\link[distory]{dist.multiPhylo}}
+}
+
+\examples{
+
+data(woodmouse)
+f.est <- function(x) root(nj(dist.dna(x)), "No305", resolve.root=TRUE)
+position.leverage(woodmouse, f.est, by=10)
+
+}
+
diff --git a/src/newick.cpp b/src/newick.cpp
new file mode 100644
index 0000000..f90fa11
--- /dev/null
+++ b/src/newick.cpp
@@ -0,0 +1,318 @@
+/*
+ * Polynomial Time Distance Algorithm for Binary Rooted Phylogenetic Trees
+ *
+ * Note: Algorithm based on presentation by Scott Provan / Megan Owen,
+ * "Computing Geodesic Distances in Tree Space in Polynomial Time"
+ *
+ * John Chakerian
+ * chakj at stanford.edu
+ *
+ * Last edited: Apr 22 2010
+ *
+ */
+
+#include <iostream>
+#include <algorithm>
+#include <cstdlib>
+#include <cctype>
+#include <set>
+#include <string>
+#include <vector>
+#include <stack>
+
+#include <R.h>
+
+#include "newick.h"
+
+// Optionally parse a floating point value after a ':' to indicate edge weight.
+// Sets the weight to 1 if no ':' is found.
+
+double ParseWeight(const string &str, unsigned int pos, unsigned int *off)
+{
+ double w = 1;
+ if(str[pos] == ':')
+ {
+ char* e;
+
+ string s = str.substr(pos+1);
+ const char* cs = s.c_str();
+
+ w = strtod(cs, &e);
+ if(cs == e)
+ {
+ w = 1;
+ }
+ else
+ {
+ pos += (e - cs)+1;
+ }
+ }
+
+ if(off)
+ *off = pos;
+
+ return w;
+}
+
+map<string,unsigned int> AssignLeafLabels(const string &str)
+{
+ unsigned int ctr = 0;
+ map<string, unsigned int> ret;
+
+ bool rec = false;
+ string t = "";
+ for(unsigned int i = 0; i < str.size(); i++)
+ {
+ if(str[i] == ' ') continue;
+
+ if(str[i] == '(' || str[i] == ',')
+ {
+ rec = true;
+ continue;
+ }
+
+ if(rec)
+ {
+ if(isalpha(str[i]) || isdigit(str[i]) || str[i] == '_' || str[i] == '-')
+ t += str[i];
+ else
+ {
+ ret[t] = ctr;
+ t = "";
+ ctr++;
+ rec = false;
+ }
+ }
+ }
+
+ return ret;
+}
+
+// We'll use a simple state machine to parse over the trees. Technically this
+// isn't the best solution - a PDA would be more appropriate, however we're not
+// interested in a full parse, only the partitions caused by each edge. Because
+// of this, we can avoid recursion or any sort of tree data structure and just
+// go right to the partitions.
+
+PhyEdgeSet NewickParse(const string &str, map<string,unsigned int> &name_to_pos)
+{
+ unsigned int pos = 0; // position in string
+ unsigned int ctr = 0; // nesting level
+ unsigned int id_ctr = 0; // for assigning each a unique id
+ string errstr;
+
+ vector< set<string> > in_leaves(30);
+ stack<int> commas; // for ensuring trees are bifurcating
+
+ vector<PhyEdge> edges;
+
+ if(str.length() == 0)
+ {
+ return vector<PhyEdge>();
+ }
+
+ // move past leading whitespace
+ while(str[pos] == ' ' || str[pos] == '\t')
+ pos++;
+
+ // we can bail out early here if we have an empty string or a ; character
+ if(str[pos] == ';')
+ {
+ return vector<PhyEdge>();
+ }
+
+ // if the last non-whitespace character isn't a ;, bail out
+ size_t tpos = str.length()-1;
+ while(tpos > 0 && isspace(str[tpos]))
+ tpos--;
+
+ if(tpos == 0) // empty string
+ {
+ return vector<PhyEdge>();
+ }
+
+ if(tpos > 0 && str[tpos] != ';')
+ {
+ errstr = "Tree not terminated with ';'";
+ goto error;
+ }
+
+ goto start_state;
+
+start_state:
+ if(str[pos] == ';')
+ goto finish;
+ else if(str[pos] == '(')
+ goto new_nesting;
+ else
+ {
+ errstr = "Parse error at first character. A tree must have at least 2 taxa.";
+ goto error;
+ }
+
+new_nesting:
+ ctr++;
+ in_leaves.push_back(set<string>());
+ commas.push(0);
+
+ pos++; // move to first character of new nesting
+
+ // move past whitespace
+ while(str[pos] == ' ' || str[pos] == '\t')
+ pos++;
+
+ if(str[pos] == '(')
+ goto new_nesting; // start the process over again
+ else if(isalpha(str[pos]) || isdigit(str[pos]) || str[pos] == '_' || str[pos] == '-')
+ goto leaf_entry;
+ else if(str[pos] == ':')
+ {
+ errstr = "Leaf nodes must have string labels.";
+ goto error;
+ }
+ else if(str[pos] == ')')
+ {
+ errstr = "Empty blocks of form () are not allowed.";
+ goto error;
+ }
+ else
+ {
+ errstr = "Parse error after '('";
+ goto error;
+ }
+
+leaf_entry:
+ // move past whitespace
+ while(str[pos] == ' ' || str[pos] == '\t')
+ pos++;
+
+ // add the edge to the leaf
+ {
+ // get the leaf name
+ unsigned int spos = pos;
+ while(isalpha(str[pos]) || isdigit(str[pos]) || str[pos] == '_' || str[pos] == '-')
+ {
+ pos++;
+ }
+
+ string leaflabel = str.substr(spos,pos-spos);
+
+ PhyEdge e(name_to_pos.size());
+ e.id = id_ctr++;
+ double w = ParseWeight(str,pos,&pos);
+ e.weight = w;
+ e.split[ name_to_pos[leaflabel] ] = false;
+
+ edges.push_back(e);
+
+ for(unsigned int i = 0; i < in_leaves.size(); i++)
+ {
+ in_leaves[i].insert(leaflabel);
+ }
+ }
+
+ // move past extra whitespace
+ while(str[pos] == ' ' || str[pos] == '\t')
+ pos++;
+
+ // decide what to do next
+ if(str[pos] == ',')
+ {
+ commas.top()++;
+ if(commas.top() > 1)
+ {
+ errstr = "More than one comma detected in a nesting.";
+ goto error;
+ }
+ pos++;
+ // move past whitespace
+ while(str[pos] == ' ' || str[pos] == '\t')
+ pos++;
+
+ if(str[pos] == '(')
+ goto new_nesting;
+ else if(isdigit(str[pos]) || isalpha(str[pos]) || str[pos] == '_' || str[pos] == '-')
+ goto leaf_entry;
+ else
+ {
+ errstr = "Couldn't figure out what to do when moving past a ','.";
+ goto error;
+ }
+ }
+ else if(str[pos] == ')')
+ goto close_nesting;
+ else
+ {
+ errstr = "Couldn't figure out what to do after a first leaf entry ";
+ goto error;
+ }
+
+close_nesting:
+ if(str[pos] != ')')
+ {
+ errstr = "Close-nesting expected but no end-bracket found.";
+ goto error;
+ }
+ pos++;
+ if(str[pos] == ';')
+ goto finish;
+
+ // add the edge
+ {
+ PhyEdge e(name_to_pos.size());
+ e.id = id_ctr++;
+ e.weight = ParseWeight(str,pos,&pos);
+
+ for(set<string>::iterator it = in_leaves[in_leaves.size()-1].begin(); it != in_leaves[in_leaves.size()-1].end(); ++it)
+ e.split[ name_to_pos[*it] ] = false;
+
+ edges.push_back(e);
+ }
+ in_leaves.pop_back();
+ commas.pop();
+
+ if(str[pos] == ';')
+ goto finish;
+
+ // decide what to do next
+ if(str[pos] == ',')
+ {
+ commas.top()++;
+ if(commas.top() > 1)
+ {
+ errstr = "More than one comma detected in a nesting. Trees MUST be binary.";
+ goto error;
+ }
+ pos++;
+ // move past whitespace
+ while(str[pos] == ' ' || str[pos] == '\t')
+ pos++;
+
+ if(str[pos] == '(')
+ goto new_nesting;
+ else if(isdigit(str[pos]) || isalpha(str[pos]) || str[pos] == '_' || str[pos] == '-')
+ goto leaf_entry;
+ else
+ {
+ errstr = "Couldn't figure out what to do when moving past a ','.";
+ goto error;
+ }
+ }
+ else if(str[pos] == ')')
+ goto close_nesting;
+ else
+ {
+ errstr = "Couldn't figure out what to do after a second leaf entry";
+ goto error;
+ }
+
+error:
+ if(errstr == "") errstr = "Parser ran off the edge.";
+ error("An error was encountered in parsing near position %d: %s\n", pos, errstr.c_str());
+
+ return vector<PhyEdge>();
+
+finish:
+ return edges;
+}
+
diff --git a/src/newick.h b/src/newick.h
new file mode 100644
index 0000000..09705d8
--- /dev/null
+++ b/src/newick.h
@@ -0,0 +1,19 @@
+#ifndef _NEWICK_H_
+#define _NEWICK_H_
+
+#include <vector>
+#include <string>
+
+#include "phylo.h"
+
+using namespace std;
+
+/* Parse a single tree in Newick format */
+PhyEdgeSet NewickParse(const string &str, map<string,unsigned int> &name_to_pos);
+
+/* Use a single tree to assign each leaf label an integer ID to be used for all
+ * future parsing */
+map<string,unsigned int> AssignLeafLabels(const string &str);
+
+#endif /* _NEWICK_H_ */
+
diff --git a/src/phydist_r.cpp b/src/phydist_r.cpp
new file mode 100644
index 0000000..1cc5871
--- /dev/null
+++ b/src/phydist_r.cpp
@@ -0,0 +1,383 @@
+/*
+ * Polynomial Time Distance Algorithm for Binary Rooted Phylogenetic Trees
+ *
+ * Note: Algorithm based on presentation by Scott Provan / Megan Owen,
+ * "Computing Geodesic Distances in Tree Space in Polynomial Time"
+ *
+ * John Chakerian
+ * chakj at stanford.edu
+ *
+ * Last edited: Feb 28 2010
+ *
+ */
+
+#include <algorithm>
+#include <vector>
+#include <fstream>
+#include <iostream>
+
+#include "treedist.h"
+
+#include <R.h>
+#include <Rinternals.h>
+
+using namespace std;
+
+int compute_phylo_distance_matrix(vector<string> trees_in, bool verbose, double *m);
+void build_tree_list(std::vector<string> &trees_in, std::vector<PhyEdgeSet> &trees, bool verbose);
+double gromov_graycode(double *m, size_t n, double* deltas, int scale);
+
+inline bool edge_less_than(const PhyEdge &a, const PhyEdge &b)
+{
+ for(unsigned int i = 0; i < a.split.size(); i++)
+ {
+ if(a.split[i] < b.split[i])
+ return true;
+ if(b.split[i] < a.split[i])
+ return false;
+ }
+ // if we're here, they're equal, so a is NOT less than b.
+ return false;
+}
+
+inline int edgeset_difference(const PhyEdgeSet &a, const PhyEdgeSet &b)
+{
+ int sim = 0;
+ for(unsigned int i = 0; i < a.size(); i++)
+ {
+ for(unsigned int j = 0; j < a.size(); j++)
+ {
+ if(a[i] == b[j])
+ {
+ sim++;
+ break;
+ }
+ }
+ }
+
+ return a.size() - sim;
+}
+
+extern "C"
+{
+ // returns a distance matrix of doubles
+ SEXP phycpp_compute_tree_distance_set(SEXP trees, SEXP verbose)
+ {
+ SEXP distmatrix;
+
+ bool be_verbose = asLogical(verbose);
+
+ // get number of trees
+ int len = length(trees);
+
+ vector<string> treevec(len);
+
+ // convert 'trees' to a vector of tree strings
+ for(int i = 0; i < len; i++)
+ treevec[i] = CHAR(STRING_ELT(VECTOR_ELT(trees,i),0));
+
+ PROTECT(distmatrix = allocMatrix(REALSXP,len, len));
+
+ compute_phylo_distance_matrix(treevec, be_verbose, REAL(distmatrix));
+
+ // replace -1 values with R N/A values
+ for(int i = 0; i < len*len; i++)
+ if(REAL(distmatrix)[i] == -1)
+ REAL(distmatrix)[i] = R_NaReal;
+
+ UNPROTECT(1);
+
+ return distmatrix;
+ }
+
+ SEXP multiset_diff_integer(SEXP set1, SEXP set2)
+ {
+ unsigned len1 = length(set1);
+ int *s1 = INTEGER(set1);
+ unsigned len2 = length(set2);
+ int *s2 = INTEGER(set2);
+
+ SEXP newset;
+ PROTECT(newset = allocVector(INTSXP, len1));
+ int *a = INTEGER(newset);
+
+ unsigned p=0;
+ for(unsigned i = 0; i < len1; i++)
+ {
+ bool keep = true;
+ for(unsigned j = 0; j < len2; j++)
+ {
+ if(s1[i] == s2[j])
+ {
+ keep = false;
+ break;
+ }
+ }
+
+ if(keep)
+ a[p++] = s1[i];
+ }
+
+ for(unsigned i = p; i < len1; i++)
+ {
+ a[i] = NA_INTEGER;
+ }
+
+
+ UNPROTECT(1);
+ return newset;
+ }
+
+ // returns edge differences
+ SEXP phycpp_bin_trees(SEXP treelist)
+ {
+ // get number of trees
+ int len = length(treelist);
+
+ vector<string> treevec(len);
+
+ // convert 'trees' to a vector of tree strings
+ for(int i = 0; i < len; i++)
+ treevec[i] = CHAR(STRING_ELT(VECTOR_ELT(treelist,i),0));
+
+ std::vector<PhyEdgeSet> trees;
+ build_tree_list(treevec, trees, false);
+
+ SEXP distmatrix;
+ PROTECT(distmatrix = allocMatrix(REALSXP,len,len));
+ double *d = REAL(distmatrix);
+
+ unsigned int sz = trees.size();
+
+ // zero out diagonal
+ for(unsigned int i = 0; i < trees.size(); i++)
+ d[sz*i + i] = 0;
+
+ for(unsigned int i = 0; i < trees.size(); i++)
+ {
+ for(unsigned int j = i; j < trees.size(); j++)
+ {
+ int sim = edgeset_difference(trees[i], trees[j]);
+ d[sz*i + j] = sim;
+ d[sz*j + i] = sim;
+ }
+ }
+
+ UNPROTECT(1);
+
+ return distmatrix;
+ }
+
+ SEXP gromov_distmatrix(SEXP distmatrix, SEXP bDeltas, SEXP scale_method)
+ {
+ bool list_deltas = asLogical(bDeltas);
+ int scaleM = asInteger(scale_method);
+
+ unsigned len = length(distmatrix);
+ unsigned n = sqrt(static_cast<double>(len));
+ double *d = REAL(distmatrix);
+
+ SEXP g;
+
+ if(list_deltas)
+ {
+ PROTECT(g = allocVector(REALSXP, (n*(n-1)*(n-2)*(n-3))/(4*3*2)));
+ gromov_graycode(d, n, REAL(g), scaleM);
+ UNPROTECT(1);
+ }
+ else
+ {
+ PROTECT(g = allocVector(REALSXP, 1));
+ REAL(g)[0] = gromov_graycode(d, n, NULL, scaleM);
+ UNPROTECT(1);
+ }
+
+ return g;
+ }
+}
+
+void build_tree_list(std::vector<string> &trees_in, std::vector<PhyEdgeSet> &trees, bool verbose)
+{
+ string t;
+
+ // build the leaf label LUT
+ t = trees_in[0];
+ map<string,unsigned int> strtbl = AssignLeafLabels(t);
+
+ for(unsigned int treeno = 0; treeno < trees_in.size(); treeno++)
+ {
+ t = trees_in[treeno];
+
+ if(verbose)
+ Rprintf("Parsing tree %d\n", treeno);
+
+ PhyEdgeSet tr = NewickParse(t, strtbl);
+
+ ClampNegativeWeights(tr);
+ trees.push_back(tr);
+ }
+}
+
+int compute_phylo_distance_matrix(vector<string> trees_in, bool verbose, double *distance_matrix)
+{
+ std::vector<PhyEdgeSet> trees;
+ build_tree_list(trees_in, trees, verbose);
+
+ int ctr = 0;
+ int tot = 0.5 * (trees.size() * (trees.size() - 1));
+
+ // figure out how big the trees are
+ int k = -1;
+ while(trees[++k].size() == 0) ;
+
+ stl_bool *incompatibility_buffer = new stl_bool[trees[k].size()*trees[k].size()];
+
+ for(unsigned int j = 0; j < trees.size(); j++) // cols
+ {
+ for(unsigned int i = 0; i < j; i++) // rows
+ {
+ ctr++;
+ if(verbose)
+ Rprintf("%d/%d\t\t[%3.2f%%]\n", ctr,tot, (ctr/(double)tot)*100.0);
+
+ if(trees[i].size() == 0 || trees[j].size() == 0) // mark invalid trees with -1 so we can mark them as N/A later
+ {
+ distance_matrix[i*trees.size()+j] = -1;
+ distance_matrix[j*trees.size()+i] = -1;
+ }
+ else
+ {
+ double d = TreeDistance(trees[i],trees[j], incompatibility_buffer);
+ distance_matrix[i*trees.size()+j] = d;
+ distance_matrix[j*trees.size()+i] = d;
+ }
+
+ }
+ }
+
+ delete [] incompatibility_buffer;
+
+ // fill in the diagonal
+ for(unsigned int i = 0; i < trees.size(); i++)
+ {
+ distance_matrix[i*trees.size()+i] = 0;
+ }
+
+ return 0;
+}
+
+double gromov_graycode(double *m, size_t n, double *deltas, int scale)
+{
+ /* implements Knuth 7.2.1.3 Alg R (Revolving Door) */
+
+ unsigned c[] = {(unsigned)-1,0,1,2,3,(unsigned)n};
+ unsigned t = 4;
+ unsigned j;
+
+ double s[3];
+ double max = 0;
+
+ double a,b;
+ double raw_delta;
+
+ unsigned i = 0;
+
+R2:
+ s[0] = m[c[1]*n + c[2]] + m[c[3]*n + c[4]];
+ s[1] = m[c[1]*n + c[3]] + m[c[2]*n + c[4]];
+ s[2] = m[c[1]*n + c[4]] + m[c[2]*n + c[3]];
+
+ /* get the two largest */
+ if(s[0] >= s[1])
+ {
+ a = s[0];
+ if(s[1] >= s[2])
+ b = s[1];
+ else
+ b = s[1];
+ }
+ else
+ {
+ a = s[1];
+ if(s[2] >= s[0])
+ b = s[2];
+ else
+ b = s[1];
+ }
+
+ raw_delta = fabs(a-b);
+ switch(scale)
+ {
+ case 2:
+ raw_delta /= fmax(a,b);
+ break;
+ case 3:
+ double dd[4];
+ dd[0] = m[c[1]*n + c[4]] + m[c[1]*n + c[3]] + m[c[3]*n + c[4]];
+ dd[1] = m[c[1]*n + c[4]] + m[c[1]*n + c[2]] + m[c[2]*n + c[4]];
+ dd[2] = m[c[2]*n + c[3]] + m[c[3]*n + c[4]] + m[c[2]*n + c[4]];
+ dd[3] = m[c[1]*n + c[2]] + m[c[1]*n + c[3]] + m[c[2]*n + c[3]];
+
+ if(dd[0] >= dd[1] && dd[0] >= dd[2] && dd[0] >= dd[3])
+ raw_delta /= dd[0];
+ else if(dd[1] >= dd[0] && dd[1] >= dd[2] && dd[1] >= dd[3])
+ raw_delta /= dd[1];
+ else if(dd[2] >= dd[0] && dd[2] >= dd[1] && dd[2] >= dd[3])
+ raw_delta /= dd[2];
+ else
+ raw_delta /= dd[3];
+ break;
+ }
+
+ if(deltas)
+ {
+ deltas[i] = raw_delta;
+ }
+
+ if(raw_delta > max)
+ {
+ max = raw_delta;
+ }
+
+ i++;
+//R3: /* N.B. assumes t is even */
+ if(c[1] > 0)
+ {
+ c[1]--;
+ goto R2;
+ }
+ else
+ {
+ j = 2;
+ goto R5;
+ }
+R4:
+ if(c[j] >= j)
+ {
+ c[j] = c[j-1];
+ c[j-1] = j - 2;
+ goto R2;
+ }
+ else
+ {
+ j++;
+ }
+R5:
+ if(c[j] + 1 < c[j+1])
+ {
+ c[j-1] = c[j];
+ c[j]++;
+ goto R2;
+ }
+ else
+ {
+ j++;
+ if(j <= t)
+ {
+ goto R4;
+ }
+ }
+
+ return max/2.0;
+}
+
diff --git a/src/phylo.h b/src/phylo.h
new file mode 100644
index 0000000..4105f6d
--- /dev/null
+++ b/src/phylo.h
@@ -0,0 +1,100 @@
+#ifndef _PHYLO_H_
+#define _PHYLO_H_
+
+#include <string>
+#include <set>
+#include <map>
+#include <vector>
+#include <climits>
+
+using namespace std;
+
+typedef unsigned char stl_bool; // to prevent bit packing, which slows access time down considerably due to non-addressability of bits
+
+struct PhyEdge
+{
+ double weight;
+
+ unsigned int id; // used so we can identify edges without doing the set comparisons
+
+ vector<stl_bool> split; // downward == false, upward == true
+
+ PhyEdge(unsigned short leafcount)
+ {
+ /* this is a hack to add in the root node implicitly. it is added as an
+ * upward edge that is never assigned to. */
+ split.resize(leafcount+1, true);
+ }
+
+ bool operator==(const PhyEdge &rhs) const // for inter-tree comparisons
+ {
+ return (split == rhs.split); /* two edges are the same if their splits of leaves are the same */
+ }
+
+ bool operator<(const PhyEdge &rhs) const // FOR ORDERING WITHIN A SINGLE TREE ONLY
+ {
+ return id < rhs.id;
+ }
+
+ inline unsigned int SubsetRemainder(const PhyEdge &rhs) const
+ {
+ bool subset = true;
+
+ // check left subset
+ for(unsigned int i = 0; i < rhs.split.size(); i++)
+ {
+ if(rhs.split[i] == false && split[i] != false)
+ {
+ subset = false;
+ break;
+ }
+ }
+
+ if(subset)
+ {
+ // count # of lefts in this
+ int nl = 0;
+ for(unsigned int i = 0; i < split.size(); i++)
+ if(split[i] == false) nl++;
+ // count # of lefts in RHS
+ int nr = 0;
+ for(unsigned int i = 0; i < rhs.split.size(); i++)
+ if(rhs.split[i] == false) nr++;
+
+ return nl - nr;
+ }
+
+ // check right subset
+
+ subset = true; // reset flag
+ for(unsigned int i = 0; i < rhs.split.size(); i++)
+ {
+ if(rhs.split[i] == true && split[i] != false)
+ {
+ subset = false;
+ break;
+ }
+ }
+
+ if(subset)
+ {
+ // count # of lefts in this
+ int nl = 0;
+ for(unsigned int i = 0; i < split.size(); i++)
+ if(split[i] == false) nl++;
+ // count # of lefts in RHS
+ int nr = 0;
+ for(unsigned int i = 0; i < rhs.split.size(); i++)
+ if(rhs.split[i] == true) nr++;
+
+ return nl - nr;
+ }
+
+ return UINT_MAX;
+ }
+};
+
+typedef vector<PhyEdge> PhyEdgeSet;
+
+#endif /* _PHYLO_H_ */
+
diff --git a/src/treedist-stripped.cpp b/src/treedist-stripped.cpp
new file mode 100644
index 0000000..74f5d28
--- /dev/null
+++ b/src/treedist-stripped.cpp
@@ -0,0 +1,553 @@
+/*
+ * Polynomial Time Distance Algorithm for Binary Rooted Phylogenetic Trees
+ *
+ * Note: Algorithm based on presentation by Scott Provan / Megan Owen,
+ * "Computing Geodesic Distances in Tree Space in Polynomial Time"
+ *
+ * John Chakerian
+ * chakj at stanford.edu
+ *
+ * Last edited: Feb 28 2010
+ *
+ * Note that this is a stripped version because it includes only a built-in
+ * Edmonds-Karp network flow implementation; linear programming and Boost.Graph
+ * implementations are not provided, since they would introduce library
+ * dependencies.
+ *
+ */
+
+#include <iostream>
+#include <algorithm>
+#include <vector>
+#include <string>
+#include <stack>
+#include <queue>
+#include <iterator>
+#include <utility>
+
+#include <cmath>
+#include <cstdlib>
+#include <cassert>
+#include <cfloat>
+
+#include "treedist.h"
+
+#include "phylo.h"
+#include "newick.h"
+
+using namespace std;
+
+// Callback for clamping 0 or negative weights at a Very Small Value (that can
+// still be squared)
+void ClampNegativeWeights(PhyEdgeSet &a)
+{
+ for(unsigned int i = 0; i < a.size(); i++)
+ if(a[i].weight < sqrt(DBL_MIN))
+ a[i].weight = sqrt(DBL_MIN);
+}
+
+// Two edges are compatible if there is an intersection between the splits
+// that is empty.
+bool EdgesCompatible(PhyEdge &e1, PhyEdge &e2)
+{
+ // n.b. - left == false, right == true
+ bool n1 = true;
+ bool n2 = true;
+ bool n3 = true;
+ bool n4 = true;
+
+ for(unsigned int i = 0; i < e1.split.size(); i++)
+ {
+ // left/left
+ if(e1.split[i] == false && e2.split[i] == false)
+ n1 = false;
+ // right/right
+ if(e1.split[i] == true && e2.split[i] == true)
+ n2 = false;
+ // right/left
+ if(e1.split[i] == true && e2.split[i] == false)
+ n3 = false;
+ // left/right
+ if(e1.split[i] == false && e2.split[i] == true)
+ n4 = false;
+ }
+
+ return (n1 || n2 || n3 || n4);
+}
+
+double TreeDistance(PhyEdgeSet &a, PhyEdgeSet &b, stl_bool* incompatible)
+{
+ // 1. Separate out into shared edges & unique edges:
+ // a) foreach edge e in a:
+ // i) search b for any edge e' with a leaf set equivalent to e
+ // ii) if found, 'remove' e and e' and store as a pair in
+ // shared_edges
+ // b) all remaining edges are unique to each
+
+ assert(a.size() == b.size());
+
+ std::vector<pair<PhyEdge,PhyEdge> > shared_edges;
+ shared_edges.reserve(a.size());
+
+ vector<int> a_shared_idxs;
+ a_shared_idxs.reserve(a.size());
+
+ vector<int> b_shared_idxs;
+ b_shared_idxs.reserve(b.size());
+
+ for(unsigned int i = 0; i < a.size(); i++)
+ {
+ for(unsigned int j = 0; j < b.size(); j++)
+ {
+ if(a[i] == b[j])
+ {
+ shared_edges.push_back( make_pair(a[i], b[j]) );
+ a_shared_idxs.push_back(i);
+ b_shared_idxs.push_back(j);
+ break; // stop checking for this i
+ }
+ }
+ }
+
+ // The not-shared edges
+ PhyEdgeSet p,q;
+
+ for(unsigned int i = 0; i < a.size(); i++)
+ if(find(a_shared_idxs.begin(), a_shared_idxs.end(), i) == a_shared_idxs.end())
+ p.push_back(a[i]);
+
+ for(unsigned int j = 0; j < b.size(); j++)
+ if(find(b_shared_idxs.begin(), b_shared_idxs.end(), j) == b_shared_idxs.end())
+ q.push_back(b[j]);
+
+ std::vector<pair<PhyEdgeSet,PhyEdgeSet> > bins(shared_edges.size()+1); // last one is generic bin
+
+ // 1.5. For every unique edge, classify under the tightest shared edge,
+
+ // Classify left-tree edges
+ for(unsigned int i = 0; i < p.size(); i++)
+ {
+ unsigned int smallest_subset = UINT_MAX;
+ unsigned int edge_id = UINT_MAX;
+
+ for(unsigned int j = 0; j < shared_edges.size(); j++)
+ {
+ unsigned int r = shared_edges[j].first.SubsetRemainder(p[i]);
+ if(r < smallest_subset)
+ {
+ smallest_subset = r;
+ edge_id = j;
+ if(smallest_subset == 1) break;
+ }
+ }
+
+ if(edge_id == UINT_MAX) // dump it in the generic bin
+ bins[shared_edges.size()].first.push_back(p[i]);
+ else
+ bins[edge_id].first.push_back(p[i]);
+ }
+
+ // Classify right-tree edges
+ for(unsigned int i = 0; i < q.size(); i++)
+ {
+ unsigned int smallest_subset = UINT_MAX;
+ unsigned int edge_id = UINT_MAX;
+
+ for(unsigned int j = 0; j < shared_edges.size(); j++)
+ {
+ unsigned int r = shared_edges[j].second.SubsetRemainder(q[i]);
+ if(r < smallest_subset)
+ {
+ smallest_subset = r;
+ edge_id = j;
+ if(smallest_subset == 1) break;
+ }
+ }
+
+ if(edge_id == UINT_MAX) // dump it in the generic bin
+ bins[shared_edges.size()].second.push_back(q[i]);
+ else
+ bins[edge_id].second.push_back(q[i]);
+ }
+
+ // Precache all incompatibilities in a 'huge' logical array
+ // (this could be made into a bitmatrix later, if space is in demand)
+ // (this could also be upper-triangular, but again, space isn't a big deal
+ // at the moment)
+
+ // Keep in mind here that the IDs are _only_ valid within a certain tree.
+ // Thus the ordering always must be (left-hand tree edge , right-hand tree
+ // edge), and the matrix is NOT symmetric
+
+ unsigned int max_id = a.size();
+
+ for(unsigned int binid = 0; binid < bins.size(); binid++)
+ for(unsigned int i = 0; i < bins[binid].first.size(); i++)
+ for(unsigned int j = 0; j < bins[binid].second.size(); j++)
+ incompatible[bins[binid].first[i].id*max_id + bins[binid].second[j].id] = EdgesCompatible(bins[binid].first[i], bins[binid].second[j]);
+
+ // 2. For every shared edge bin, compute distance for all unique edges under
+ // it.
+
+ double unique_edge_factor = 0;
+ for(unsigned int i = 0; i < bins.size(); i++)
+ if(bins[i].first.size() > 0)
+ unique_edge_factor += DisjointTreeDistance(bins[i].first, bins[i].second, incompatible, max_id);
+
+ // 3. Calculate the final distance from both shared & unique edges
+ // a) the shared edges have Euclidean distance:
+
+ double shared_edge_factor = 0;
+ for(unsigned int i = 0; i < shared_edges.size(); i++)
+ {
+ double t = shared_edges[i].first.weight - shared_edges[i].second.weight;
+ shared_edge_factor += t*t;
+ }
+
+ // c) combine them together and return
+ return sqrt(unique_edge_factor + shared_edge_factor);
+}
+
+struct NetworkFlowResult
+{
+ double flow;
+ PhyEdgeSet A_i, B_i, A_i_prime, B_i_prime;
+};
+
+NetworkFlowResult EKNetworkFlow(PhyEdgeSet &A_i, PhyEdgeSet& B_i, stl_bool* incompatible, unsigned int max_id)
+{
+ // make these convenient, since they won't be changing
+ unsigned int ac = A_i.size();
+ unsigned int bc = B_i.size();
+
+ // compute square sums of A_i and B_i
+ double sumsq_A_i = 0;
+ for(unsigned int i = 0; i < ac; i++)
+ sumsq_A_i += A_i[i].weight*A_i[i].weight;
+
+ double sumsq_B_i = 0;
+ for(unsigned int j = 0; j < bc; j++)
+ sumsq_B_i += B_i[j].weight*B_i[j].weight;
+
+ // A_i first, then B_i, then source, then sink
+ unsigned int N = ac + bc + 2; // for offsets
+ unsigned int s = ac+bc; // source index
+ unsigned int t = ac+bc+1; // sink index
+
+ // set up the variables for Edmonds-Karp
+
+ // total flow
+ double f = 0;
+
+ double *F = new double[N*N]; // flow
+ double *C = new double[N*N]; // capacity
+
+ // set all the flow to zero
+ fill(F,F + N*N,0);
+ fill(C,C + N*N,0); // necessary?
+
+ // add capacities from source to A_i
+ for(unsigned int i = 0; i < A_i.size(); i++) // for (s,i)
+ {
+ C[N*s + i] = (A_i[i].weight*A_i[i].weight)/sumsq_A_i;
+ C[N*i + s] = 0;
+ }
+
+ // add capacities from B_i to sink
+ for(unsigned int j = 0; j < B_i.size(); j++) // for (j, t)
+ {
+ C[N*(ac+j) + t] = (B_i[j].weight*B_i[j].weight)/sumsq_B_i;
+ C[N*t + (ac+j)] = 0;
+ }
+
+ // add in incompatible edges
+ vector<pair<unsigned int,unsigned int> > incompatible_edge_pairs;
+ for(unsigned int i = 0; i < A_i.size(); i++)
+ {
+ for(unsigned int j = 0; j < B_i.size(); j++)
+ {
+ if(incompatible[A_i[i].id*max_id + B_i[j].id ]) // add (i,j) and (j,i)
+ {
+ C[N*i+(ac+j)] = DBL_MAX;
+ C[N*(ac+j)+i] = 0;
+ }
+ }
+ }
+
+ int *P = new int[N]; // path (reused)
+ double *M = new double[N]; // capacity to node
+
+ while(true)
+ {
+ // clean up old values, init the path
+ fill(P, P+N, -1);
+ fill(M, M+N, DBL_MAX);
+ P[s] = -2;
+
+ // do BFS to get a new path
+
+ std::queue<unsigned int> tc; // to-check
+ tc.push(s);
+
+ bool res = false; // make sure we get a result
+ while(!tc.empty())
+ {
+ unsigned int u = tc.front();
+ tc.pop();
+
+ // we need to get OUT-edges
+ // decide if we're on a A_i, B_i, s, or t
+ vector<unsigned int> E;
+ if(u == s)
+ {
+ // add the nodes in A_i
+ E.resize(ac,0);
+ for(unsigned int i = 0; i < ac; i++)
+ E[i] = i;
+ }
+ else if(u == t)
+ {
+ // add the nodes in B_i
+ E.resize(bc,0);
+ for(unsigned int i = 0; i < bc; i++)
+ E[i] = ac + i;
+ }
+ else if(u < ac) // then it is in A_i
+ {
+ // add the nodes in A_i
+ for(unsigned int j = 0; j < bc; j++)
+ if(incompatible[A_i[u].id*max_id + B_i[j].id ])
+ E.push_back(ac + j);
+ // add the source - don't think we actually need to do this
+ E.push_back(s);
+ }
+ else // then it is in B_i
+ {
+ // add the nodes in A_i
+ for(unsigned int j = 0; j < ac; j++)
+ if(incompatible[A_i[j].id*max_id + B_i[u-ac].id ])
+ E.push_back(j);
+ // add the sink
+ E.push_back(t);
+ }
+
+ // now iterate through E
+ for(unsigned int i = 0; i < E.size(); i++)
+ {
+ if(C[ N*u+E[i] ] - F[ N*u+E[i] ] > 0 && P[E[i]] == -1)
+ {
+ P[E[i]] = u;
+ M[E[i]] = min(M[u],C[ N*u+E[i] ] - F[ N*u+E[i] ]);
+ if(E[i] != t)
+ tc.push(E[i]);
+ else
+ {
+ // we're done with the BFS - empty the queue out
+ while(!tc.empty())
+ tc.pop();
+ res = true;
+ }
+ }
+ }
+ }
+
+ double m = 0;
+ if(res) m = M[t];
+
+ // check capacity of the path, break if 0 (no new path found)
+ if(m == 0)
+ break;
+
+ // add capacity of the path to flow
+ f += m;
+
+ // travel from sink to source along the path, updating edges with +/- cap
+ unsigned int v = t;
+ while(v != s)
+ {
+ unsigned int u = P[v];
+ F[N*u + v] += m;
+ F[N*v + u] -= m;
+ v = u;
+ }
+ }
+
+ NetworkFlowResult r;
+ r.flow = f;
+
+ if(f < 1 - DBL_EPSILON*100)
+ {
+ // do a BFS to get splits
+
+ // reset P
+ fill(P, P+N, -1);
+
+ std::queue<unsigned int> Q;
+ Q.push(s);
+
+ while(!Q.empty())
+ {
+ unsigned int u = Q.front();
+ Q.pop();
+
+ // we need to get OUT-edges
+ // decide if we're on a A_i, B_i, s, or t
+ vector<unsigned int> E;
+ if(u == s)
+ {
+ // add the nodes in A_i
+ E.resize(ac,0);
+ for(unsigned int i = 0; i < ac; i++)
+ E[i] = i;
+ }
+ else if(u == t)
+ {
+ // add the nodes in B_i
+ E.resize(bc,0);
+ for(unsigned int i = 0; i < bc; i++)
+ E[i] = ac + i;
+ }
+ else if(u < ac) // then it is in A_i
+ {
+ // add the nodes in A_i
+ for(unsigned int j = 0; j < bc; j++)
+ if(incompatible[ A_i[u].id*max_id + B_i[j].id ])
+ E.push_back(ac + j);
+ // add the source - don't think we actually need to do this
+ E.push_back(s);
+ }
+ else // then it is in B_i
+ {
+ // add the nodes in A_i
+ for(unsigned int j = 0; j < ac; j++)
+ if(incompatible[A_i[j].id*max_id + B_i[u-ac].id ])
+ E.push_back(j);
+ // add the sink
+ E.push_back(t);
+ }
+
+ // now iterate through E
+ for(unsigned int i = 0; i < E.size(); i++)
+ {
+ if(C[ N*u+E[i] ] - F[ N*u+E[i] ] > 0 && P[E[i]] == -1)
+ {
+ P[E[i]] = 1;
+ Q.push(E[i]);
+ }
+ }
+ }
+
+ // form our split vectors
+ for(unsigned int i = 0; i < N-2; i++) // ignore the last 2, since source/sink
+ {
+ if(i < ac) // in A_i
+ {
+ if(P[i] == 1)
+ r.A_i_prime.push_back(A_i[i]);
+ else
+ r.A_i.push_back(A_i[i]);
+ }
+ else // in B_i
+ {
+ if(P[i] == 1)
+ r.B_i_prime.push_back(B_i[i-ac]);
+ else
+ r.B_i.push_back(B_i[i-ac]);
+ }
+ }
+ }
+
+ delete [] C;
+ delete [] F;
+ delete [] P;
+ delete [] M;
+
+ return r;
+}
+
+double DisjointTreeDistance(PhyEdgeSet &a, PhyEdgeSet &b, stl_bool *incompatible, unsigned int max_id)
+{
+ // 2. For unique edges:
+ // a) initialize A_1 to all unique edges in a, B_1 to all unique edges in b
+ // b) for each edge e in A_i find all incompatible edges in b
+ // c) construct a bipartite graph based on these incompatible edges
+ // d) run max flow
+ // e) do BFS to find the min-weight vertex cover
+ // f) if min-weight vertex cover has weight < 1, split based on
+ // min-weight vertex cover and push blocks into the right places, else move to next A_i
+
+ stack<pair<PhyEdgeSet, PhyEdgeSet> > unchecked_blocks;
+ std::vector<pair<PhyEdgeSet, PhyEdgeSet> > finished_blocks; // NB blocks in reverse order
+
+ pair<PhyEdgeSet, PhyEdgeSet> p;
+ for(unsigned int i = 0; i < a.size(); i++)
+ p.first.push_back(a[i]);
+
+ for(unsigned int j = 0; j < b.size(); j++)
+ p.second.push_back(b[j]);
+
+ unchecked_blocks.push(p);
+
+ while(!unchecked_blocks.empty())
+ {
+ PhyEdgeSet A_i = unchecked_blocks.top().first;
+ PhyEdgeSet B_i = unchecked_blocks.top().second;
+ unchecked_blocks.pop();
+
+ // Some sanity checks
+ if(A_i.size() == 0 || B_i.size() == 0)
+ {
+ finished_blocks.push_back( make_pair(A_i, B_i) );
+ continue;
+ }
+
+ if(A_i.size() == 1 && B_i.size() == 1)
+ {
+ finished_blocks.push_back( make_pair(A_i, B_i) );
+ continue;
+ }
+
+ NetworkFlowResult r = EKNetworkFlow(A_i, B_i, incompatible, max_id);
+
+ if(r.flow < 1 - DBL_EPSILON*100)
+ {
+ unchecked_blocks.push( make_pair(r.A_i_prime, r.B_i_prime) );
+ unchecked_blocks.push( make_pair(r.A_i, r.B_i) );
+ }
+ else
+ {
+ finished_blocks.push_back( make_pair(A_i, B_i) );
+ }
+ }
+
+ // the unique edges have distance sqrt( sum_i (||A_i|| + ||B_i||)^2 )
+ double unique_edge_factor = 0;
+ for(unsigned int i = 0; i < finished_blocks.size(); i++)
+ {
+ double accum = 0;
+
+ PhyEdgeSet a = finished_blocks[i].first;
+ PhyEdgeSet b = finished_blocks[i].second;
+
+ // do the A_i block
+ double t = 0;
+ for(unsigned j = 0; j < a.size(); j++)
+ t += a[j].weight*a[j].weight;
+ t = sqrt(t);
+
+ accum += t;
+
+ // do the B_i block
+ t = 0;
+ for(unsigned j = 0; j < b.size(); j++)
+ t += b[j].weight*b[j].weight;
+ t = sqrt(t);
+
+ accum += t;
+
+ unique_edge_factor += accum*accum;
+ }
+
+ return unique_edge_factor; // n.b., not square rooted
+}
+
diff --git a/src/treedist.h b/src/treedist.h
new file mode 100644
index 0000000..d33c745
--- /dev/null
+++ b/src/treedist.h
@@ -0,0 +1,32 @@
+/*
+ * Polynomial Time Distance Algorithm for Binary Rooted Phylogenetic Trees
+ *
+ * Note: Algorithm based on presentation by Scott Provan / Megan Owen,
+ * "Computing Geodesic Distances in Tree Space in Polynomial Time"
+ *
+ * John Chakerian
+ * chakj at stanford.edu
+ *
+ */
+
+#ifndef __TREEDIST_H_
+#define __TREEDIST_H_
+
+#include <string>
+
+#include "phylo.h"
+#include "newick.h"
+
+double ConeDistance(PhyEdgeSet &a, PhyEdgeSet &b);
+double TreeDistance(PhyEdgeSet &a, PhyEdgeSet &b, stl_bool* incompatible);
+double DisjointTreeDistance(PhyEdgeSet &a, PhyEdgeSet &b, stl_bool* incompatible, unsigned int max_id);
+void ClampNegativeWeights(PhyEdgeSet &a);
+
+enum maxflow_enum { LP, EK, EKF, KOL, PR };
+extern maxflow_enum mf_method;
+enum metric_enum { GEODESIC, CONEPATH };
+extern metric_enum metric_method;
+
+string tolower_str(string in);
+
+#endif
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/r-cran-distory.git
More information about the debian-med-commit
mailing list