[med-svn] [r-cran-distory] 03/14: New upstream version 1.4.2

Andreas Tille tille at debian.org
Sat Sep 23 06:49:14 UTC 2017


This is an automated email from the git hooks/post-receive script.

tille pushed a commit to branch master
in repository r-cran-distory.

commit 8b18df95df9ba040cc27d62665bf5b03864b0030
Author: Andreas Tille <tille at debian.org>
Date:   Sat Sep 23 07:15:07 2017 +0200

    New upstream version 1.4.2
---
 DESCRIPTION                  |  15 ++
 MD5                          |  26 ++
 NAMESPACE                    |   6 +
 R/bethe.tree.R               |  35 +++
 R/bin.multiPhylo.R           |  17 ++
 R/dist.multiPhylo.R          | 183 ++++++++++++++
 R/gromov.hyperbolicity.R     |  30 +++
 R/mcmc.target.seq.R          | 135 +++++++++++
 R/orthant.boundary.tree.R    |  32 +++
 R/phylo.diff.R               |  79 +++++++
 R/position.leverage.R        |  33 +++
 R/zzz.R                      |   5 +
 debian/changelog             |   5 -
 debian/compat                |   1 -
 debian/control               |  23 --
 debian/copyright             |  39 ---
 debian/rules                 |   3 -
 debian/source/format         |   1 -
 debian/watch                 |   2 -
 man/bethe.tree.Rd            |  56 +++++
 man/bin.multiPhylo.Rd        |  60 +++++
 man/dist.multiPhylo.Rd       | 122 ++++++++++
 man/distory-package.Rd       |  51 ++++
 man/gromov.hyperbolicity.Rd  |  75 ++++++
 man/mcmc.target.seq.Rd       | 136 +++++++++++
 man/orthant.boundary.tree.Rd |  73 ++++++
 man/phylo.diff.Rd            | 107 +++++++++
 man/position.leverage.Rd     |  71 ++++++
 src/newick.cpp               | 318 +++++++++++++++++++++++++
 src/newick.h                 |  19 ++
 src/phydist_r.cpp            | 383 ++++++++++++++++++++++++++++++
 src/phylo.h                  | 100 ++++++++
 src/treedist-stripped.cpp    | 553 +++++++++++++++++++++++++++++++++++++++++++
 src/treedist.h               |  32 +++
 34 files changed, 2752 insertions(+), 74 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
new file mode 100644
index 0000000..605d16a
--- /dev/null
+++ b/DESCRIPTION
@@ -0,0 +1,15 @@
+Package: distory
+Type: Package
+Title: Distance Between Phylogenetic Histories
+Version: 1.4.2
+Date: 2010-06-07
+Author: John Chakerian and Susan Holmes
+Maintainer: Susan Holmes <susan at stat.stanford.edu>
+Depends: ape (>= 2.3), R (>= 2.6.0)
+Description: Geodesic distance between phylogenetic trees and
+        associated functions.
+License: BSD
+Packaged: 2013-11-12 09:21:46 UTC; ripley
+Repository: CRAN
+Date/Publication: 2013-11-12 10:27:52
+NeedsCompilation: yes
diff --git a/MD5 b/MD5
new file mode 100644
index 0000000..9f37364
--- /dev/null
+++ b/MD5
@@ -0,0 +1,26 @@
+f168f781a67f6e7f7ab62ae647228db3 *DESCRIPTION
+26de99ccfb9ca8a2a213a4818be3b0bf *NAMESPACE
+850d5d746dff4c1e7e623fe7cd35b03a *R/bethe.tree.R
+a2ed747f17ee32505c78a67a2ab03945 *R/bin.multiPhylo.R
+2add15a328c7986ad1baf02a393239b7 *R/dist.multiPhylo.R
+cbd0f76fea9729eca45ed77e486e8513 *R/gromov.hyperbolicity.R
+4606b7200fc81f8a2c8204b170d8e19a *R/mcmc.target.seq.R
+056d8a5ee03dc5dd336ed03dd90357e7 *R/orthant.boundary.tree.R
+626d600d681b8f4cc1b842759126e3ee *R/phylo.diff.R
+b9984556b72472e92c3b298ddcd06df5 *R/position.leverage.R
+b645bf9091131f9aafecc84714c99776 *R/zzz.R
+d364424cfc58647da8ce0e9d709e829e *man/bethe.tree.Rd
+0bc572f4d938febf6e36a51583d17d7b *man/bin.multiPhylo.Rd
+326d3b4a951ffe4454a8266e70e6fd4d *man/dist.multiPhylo.Rd
+ef5f85add48282f9029861e4cd3c75de *man/distory-package.Rd
+65e6debd9d460f4cccabccb2843abd92 *man/gromov.hyperbolicity.Rd
+2adec96ee8efea3a6b2667bd9fdd0374 *man/mcmc.target.seq.Rd
+b81c0afa8545be62ea04abd94e74d081 *man/orthant.boundary.tree.Rd
+d679ae0bc472e781b887c1506f70d7bd *man/phylo.diff.Rd
+0d297ad815b52a39bfe52ca3075b8a32 *man/position.leverage.Rd
+f4d2b55dddfbaca90b778b3e1f45e536 *src/newick.cpp
+76960301ce979b7a7595211c7f3df496 *src/newick.h
+578f0107cf2171675d24aa978bc145e0 *src/phydist_r.cpp
+ab78adf7f6f16d7cbc09a1265f36b4e1 *src/phylo.h
+32800a06d5f51921d0ce2c1ba960bff1 *src/treedist-stripped.cpp
+16f8de5df9c6f8798a3e6f57495ec821 *src/treedist.h
diff --git a/NAMESPACE b/NAMESPACE
new file mode 100644
index 0000000..ce36d59
--- /dev/null
+++ b/NAMESPACE
@@ -0,0 +1,6 @@
+useDynLib("distory")
+
+# Export all names
+exportPattern(".")
+
+import(ape)
diff --git a/R/bethe.tree.R b/R/bethe.tree.R
new file mode 100644
index 0000000..d7ced3e
--- /dev/null
+++ b/R/bethe.tree.R
@@ -0,0 +1,35 @@
+bethe.tree <- function(tips, level.lengths = NULL, outgroup="O", outgroup.dist=1)
+{
+    if(length(tips) != length(unique(tips)))
+    {
+        stop("Not all tips are unique.\n")
+    }
+
+    if(!is.null(level.lengths))
+    {
+        if(length(level.lengths) == 1)
+        {
+            level.lengths = rep(level.lengths, 2)
+        }
+    }
+
+    if(is.null(level.lengths) || length(level.lengths) == 0)
+    {
+        level.lengths = 1
+    }
+
+    groupings <- as.numeric(sapply(1:(length(tips)/2), function(x) rep(x,2)))
+
+    d <- level.lengths[1]
+    level.lengths <- level.lengths[-1]
+    tips <- lapply(split(tips,groupings), function(x) { sprintf("(%s:%d,%s:%d)",x[1],d,x[2],d)} ) 
+    if(length(tips) == 1)
+    {
+        read.tree(text=sprintf("(%s:%d, %s:%d);", tips[[1]], d, outgroup, outgroup.dist))
+    }
+    else
+    {
+        bethe.tree(tips, level.lengths, outgroup, outgroup.dist)
+    }
+}
+
diff --git a/R/bin.multiPhylo.R b/R/bin.multiPhylo.R
new file mode 100644
index 0000000..804abc3
--- /dev/null
+++ b/R/bin.multiPhylo.R
@@ -0,0 +1,17 @@
+bin.multiPhylo <- function(treelist)
+{
+    edge.dists <- dist.multiPhylo(treelist, method="edgeset")
+    edge.dists <- as.matrix(edge.dists)
+
+    bin.id = 1
+    binning = rep(NA,length(treelist))
+    while(any(is.na(binning)))
+    {
+         # find the next unbinned tree & find all matching trees
+         next.tr = min(which(is.na(binning)))
+         binning[edge.dists[,next.tr] == 0] = bin.id
+         bin.id = bin.id + 1
+    }
+
+    binning
+}
diff --git a/R/dist.multiPhylo.R b/R/dist.multiPhylo.R
new file mode 100644
index 0000000..0abcf3b
--- /dev/null
+++ b/R/dist.multiPhylo.R
@@ -0,0 +1,183 @@
+dist.multiPhylo <- function(x, method="geodesic", force.multi2di = FALSE,
+        outgroup = NULL, convert.multifurcating = FALSE, use.random.resolution =
+        FALSE, scale = NULL, verbose = FALSE)
+{
+    if(length(x) < 2)
+        return(matrix())
+
+    if(class(x) == "multiPhylo") # ideally, we will have this
+    {
+        # run checks if appropriate
+        # separate it out into a vector of strings
+
+        if(!is.null(outgroup))
+        {
+            x <- lapply(x, function(k)
+                    {
+                    if(class(k) == "phylo")
+                    {
+                        if(!is.rooted(k))
+                            root(k, outgroup, resolve.root = TRUE)
+                        else k
+                    }
+                    else NA
+                    })
+        }
+
+
+        if(force.multi2di == TRUE)
+        {
+            x <- lapply(x, function(k) 
+                  { 
+                    if(class(k) == "phylo")
+                    {
+                    multi2di(k,random=use.random.resolution) 
+                    }
+                    else NA
+                  }) 
+        }
+        else if(convert.multifurcating == TRUE) # won't resolve multifurcations at the root
+        {
+            x <- lapply(x, function(k) 
+                  { 
+                    if(class(k) == "phylo")
+                    {
+                        if(!is.binary.tree(k))
+                            multi2di(k,random=use.random.resolution) 
+                        else k 
+                    }
+                    else NA
+                  }) 
+
+        }
+
+        if(!is.null(scale))
+        {
+            if(class(scale) == "phylo")
+            {
+                T <- sum(scale$edge.length)
+                x <- lapply(x, function(k)
+                        {
+                            if(class(k) == "phylo")
+                            {
+                                k$edge.length = k$edge.length * (T /
+                                    sum(k$edge.length))
+                                k
+                            }
+                            else NA
+                            })
+            }
+            else if(class(scale) == "numeric")
+            {
+                x <- lapply(x, function(k)
+                        {
+                            if(class(k) == "phylo")
+                            {
+                                k$edge.length = k$edge.length * (scale /
+                                    sum(k$edge.length))
+                                k
+                            }
+                            else NA
+                            })
+            }
+            else
+            {
+                stop("Scale parameter not understood.\n")
+            }
+        }
+
+        # do some sanity checks before we start out
+
+        r <- lapply(x, function(k) 
+                { 
+                if(class(k) == "phylo")
+                {
+                    is.rooted(k) 
+                }
+                else NA
+                })
+
+        if(!all(as.logical(r), na.rm=TRUE))
+        {
+            stop("Some trees are not rooted. Specify an outgroup to fix this problem. All trees must be rooted.\n")
+        }
+
+        r <- lapply(x, function(k) { if(class(k) == "phylo") is.binary.tree(k) else NA })
+        if(!all(as.logical(r), na.rm=TRUE))
+        {
+            stop("Some trees are not binary. All input trees must be strictly binary.\n")
+        }
+
+        # check to make sure all trees have the same tip labels
+        tips = x[[1]]$tip.label
+        r <- lapply(x, function(k) { if(class(k) == "phylo") setequal(k$tip.label,
+                    tips) else NA})
+
+        if(!all(as.logical(r), na.rm=TRUE))
+        {
+            stop("Not all trees have the same tips.\n")
+        }
+
+        # convert our list of class phylo to a list of strings
+        treestrs <- lapply(x, function(k) { if(class(k) == "phylo")
+                write.tree(k) else "" })
+        
+        method=tolower(method)
+
+        method.id = pmatch(method, c("edgeset", "geodesic"))
+
+        # call the C interface function and return the value automatically
+        if(method.id == 1)
+        {
+            rv <- .Call("phycpp_bin_trees", treestrs, PACKAGE="distory")
+        }
+        else if(method.id == 2)
+        {
+            rv <- .Call("phycpp_compute_tree_distance_set", treestrs, as.logical(verbose), PACKAGE="distory")
+        }
+        else
+        {
+            stop("Specified method is not valid")
+        }
+
+        as.dist(rv)
+    }
+
+    else if(typeof(x) == "list")
+    {
+        if(class(x[[1]]) == "phylo") # a list of phylo's that for some reason is not classed as multiPhylo
+        {
+            class(x) <- "multiPhylo" # it already is basically a multiPhylo anyways - we'll mark it as such
+            dist.multiPhylo(x, method=method, force.multi2di=force.multi2di, outgroup=outgroup,
+                        convert.multifurcating=convert.multifurcating,
+                        use.random.resolution=use.random.resolution,
+                        scale=scale, verbose=verbose)
+        }
+        else if(class(x[[1]]) == "character") # a list of strings, presuming one tree each, properly terminated
+        {
+            # read with /ape/, run checks, dump back
+            t <- paste(x, sep="", collapse="")
+            k <- read.tree(text=t)
+            dist.multiPhylo(x, method=method, force.multi2di=force.multi2di, outgroup=outgroup,
+                        convert.multifurcating=convert.multifurcating,
+                        use.random.resolution=use.random.resolution,
+                        scale=scale, verbose=verbose)
+        }
+    }
+    else if(class(x) == "character") # this is for one string containing multiple trees
+    {
+        # read with ape and dump back to a vector of strings
+        k <- read.tree(text=x)
+
+        # call this to process it properly
+        dist.multiPhylo(x, method=method, force.multi2di=force.multi2di, outgroup=outgroup,
+                        convert.multifurcating=convert.multifurcating,
+                        use.random.resolution=use.random.resolution,
+                        scale=scale, verbose=verbose)
+    }
+    else
+    {
+        stop("Cannot coerce the argument into a usable type.")
+    }
+}
+
diff --git a/R/gromov.hyperbolicity.R b/R/gromov.hyperbolicity.R
new file mode 100644
index 0000000..3e55659
--- /dev/null
+++ b/R/gromov.hyperbolicity.R
@@ -0,0 +1,30 @@
+gromov.hyperbolicity <- function(d, deltas = FALSE, scale = NA)
+{
+    d <- as.matrix(as.dist(d))
+    a = dim(d);
+    if(a[1] != a[2])
+    {
+        stop("The parameter could not be coerced into a square distance matrix.")
+    }
+
+    if(a[1] < 4)
+    {
+        stop("At least 4 points must be used to compute the Gromov hyperbolicity constant.")
+    }
+
+    if(is.na(scale))
+    {
+        scale = "none"
+    }
+
+    scaleV = pmatch(scale, c("none", "max", "perimeter"))
+
+    if(length(scaleV) > 1 || is.na(scaleV))
+    {
+        stop("Unknown or ambiguous scale method.");
+    }
+
+    .Call("gromov_distmatrix", as.double(d), as.logical(deltas),
+            as.integer(scaleV), PACKAGE="distory")
+}
+
diff --git a/R/mcmc.target.seq.R b/R/mcmc.target.seq.R
new file mode 100644
index 0000000..4a6295d
--- /dev/null
+++ b/R/mcmc.target.seq.R
@@ -0,0 +1,135 @@
+mcmc.target.seq <- function(data, x, F, n) 
+{
+    N = ncol(data)
+    D = rep(1, N)
+
+    likvals = c()
+
+    a = runif(n)
+
+    old.tree = F(lookup.samples(data, list(convert.table.to.idx(D)))[[1]])
+    old.dist = as.matrix(dist.multiPhylo(list(x, old.tree)))[1,2]
+
+    best.tree = old.tree
+    best.dist = old.dist
+
+    for(i in 1:n)
+    {
+        Dp = D
+        good = FALSE
+
+        r = sample.int(N, 2)
+        while(D[r[1]] == N || D[r[2]] == 0)
+        {    
+            r = sample.int(N, 2)
+        }
+
+        # propose the swap
+        Dp[r[1]] = Dp[r[1]] + 1;
+        Dp[r[2]] = Dp[r[2]] - 1;
+
+        new.tree = F(lookup.samples(data, list(convert.table.to.idx(Dp)))[[1]])
+        new.dist = as.matrix(dist.multiPhylo(list(x, new.tree)))[1,2]
+
+        v = old.dist/new.dist
+
+        if(v >= 1) 
+        { 
+            v = 1 
+            best.tree = new.tree
+            best.dist = new.dist
+        } 
+
+        if(v^(100-exp(1/i)) > a[i])
+        {
+            D = Dp
+            old.tree = new.tree
+            old.dist = new.dist
+        }
+
+        likvals = c(likvals, old.dist)
+    }
+    
+    list(data = D, tree = best.tree, distance = best.dist, vals = likvals)
+}
+
+# modified from ape code for boot.phylo
+boot.samples.idxs <- function(data, B = 100, block = 1)
+{
+    if (is.list(data) && !is.data.frame(data)) {
+        if (inherits(data, "DNAbin")) 
+            data <- as.matrix(data)
+        else {
+            nm <- names(data)
+            n <- length(data)
+            data <- unlist(data)
+            nL <- length(data)
+            data <- matrix(data, n, nL/n, byrow = TRUE)
+            rownames(data) <- nm
+        }
+    }
+    boot.smpls <- vector("list", B)
+    for (i in 1:B) {
+        if (block > 1) {
+            y <- seq(block, ncol(data), block)
+            boot.i <- sample(y, replace = TRUE)
+            boot.samp <- numeric(ncol(data))
+            boot.samp[y] <- boot.i
+            for (j in 1:(block - 1)) boot.samp[y - j] <- boot.i - 
+                j
+        }
+        else boot.samp <- sample(ncol(data), replace = TRUE)
+
+        boot.smpls[[i]] <- boot.samp
+    }
+
+    boot.smpls
+}
+
+lookup.samples <- function(data, idxs)
+{
+    if (is.list(data) && !is.data.frame(data)) {
+        if (inherits(data, "DNAbin")) 
+            data <- as.matrix(data)
+        else {
+            nm <- names(data)
+            n <- length(data)
+            data <- unlist(data)
+            nL <- length(data)
+            data <- matrix(data, n, nL/n, byrow = TRUE)
+            rownames(data) <- nm
+        }
+    }
+
+    lapply(idxs, function(i) { data[,i] } )
+}
+
+phylo.diff <- function(x, y, ...)
+{
+    uniqT1 <- distinct.edges(x, y)
+    uniqT2 <- distinct.edges(y, x)
+    
+    treeA.cs <- rep("black", dim(x$edge)[1]) 
+    treeA.cs[uniqT1] = "red"
+
+    treeB.cs <- rep("black", dim(y$edge)[1]) 
+    treeB.cs[uniqT2] = "red"
+
+    par(mfrow=c(1,2))
+    plot(x, edge.color=treeA.cs, ...)
+    plot(y, edge.color=treeB.cs, ...)
+
+    invisible()
+}
+
+convert.table.to.idx <- function(T)
+{
+    dat = c()
+    for(i in 1:length(T))
+    {
+        dat = c(dat, rep(i, T[i]))
+    }
+
+    dat
+}
+
diff --git a/R/orthant.boundary.tree.R b/R/orthant.boundary.tree.R
new file mode 100644
index 0000000..bcc7f94
--- /dev/null
+++ b/R/orthant.boundary.tree.R
@@ -0,0 +1,32 @@
+orthant.boundary.tree <- function(x, y)
+{
+    e1 <- distinct.edges(x,y)
+    e2 <- distinct.edges(y,x)
+
+    if(length(e1) != 1)
+        stop("Trees must differ by only one edge")
+
+    length.e1 = x$edge.length[e1]
+    length.e2 = y$edge.length[e2]
+
+    lambda = length.e1 / ( length.e1 + length.e2)
+
+    bdy.tree <- x
+
+    partitions <- partition.leaves(x)
+
+    bdy.tree$edge.length <- lapply(partitions,
+            function(e)
+            {
+                a.e <- x$edge.length[edge.from.split(x, e)]
+                b.e <- y$edge.length[edge.from.split(y, e)]
+
+                a.e*(1 - lambda) + b.e * lambda
+            }
+        )
+    
+    bdy.tree$edge.length[e1] = 0;
+
+    bdy.tree
+}
+
diff --git a/R/phylo.diff.R b/R/phylo.diff.R
new file mode 100644
index 0000000..fdb61b3
--- /dev/null
+++ b/R/phylo.diff.R
@@ -0,0 +1,79 @@
+phylo.diff <- function(x, y, ...)
+{
+    uniqT1 <- distinct.edges(x, y)
+    uniqT2 <- distinct.edges(y, x)
+    
+    treeA.cs <- rep("black", dim(x$edge)[1]) 
+    treeA.cs[uniqT1] = "red"
+
+    treeB.cs <- rep("black", dim(y$edge)[1]) 
+    treeB.cs[uniqT2] = "red"
+
+    par(mfrow=c(1,2))
+    plot(x, edge.color=treeA.cs, ...)
+    plot(y, edge.color=treeB.cs, ...)
+
+    invisible()
+}
+
+distinct.edges <- function(x, y) # all edges in x not in y
+{
+    bp1 <- partition.leaves(x)
+    bp1 <- lapply(bp1, sort)
+    bp2 <- partition.leaves(y)
+    bp2 <- lapply(bp2, sort)
+
+    p = c()
+
+    for(i in 1:length(bp1))
+    {
+        if(!(list(bp1[[i]]) %in% bp2))
+        {
+            p <- append(p, i)
+        }
+    }
+
+    p
+}
+
+edge.from.split <- function(x, split)
+{
+    splits <- partition.leaves(x)
+    splits <- lapply(splits, sort)
+    split <- sort(split)
+    match(list(split), splits)
+}
+
+get.bipartition <- function(x, e) # of leaves from edge
+{
+    acc = vector()
+    inc = FALSE;
+    for(i in 1:dim(x$edge)[1])
+    {
+        if(x$edge[i,1] == e)
+        {
+            acc <- append(acc, get.bipartition(x, x$edge[i,2]));
+            inc = TRUE;
+        }
+    }
+
+    if(!inc)
+    {
+        acc = x$tip.label[e];
+    }
+
+    acc
+}
+
+partition.leaves <- function(x) # get all bipartitoins
+{
+    r = list()
+    for(i in 1:dim(x$edge)[1])
+    {
+        t <- get.bipartition(x, x$edge[i,2]);
+        r <- append(r, list(t));
+    }
+
+    r
+}
+
diff --git a/R/position.leverage.R b/R/position.leverage.R
new file mode 100644
index 0000000..423e127
--- /dev/null
+++ b/R/position.leverage.R
@@ -0,0 +1,33 @@
+position.leverage <- function(data, F, to = NULL, rep = 50, by = 1)
+{
+    N = ncol(data)
+    D = rep(1, N)
+
+    leverage.vals = c()
+
+    if(is.null(to))
+    {
+        to = F(lookup.samples(data, list(convert.table.to.idx(D)))[[1]])
+    }
+
+    for(i in seq(1,N,by))
+    {
+        Dp = D;
+        S = sample(N, N/rep);
+        while(i %in% S)
+        {
+            S = sample(N,N/rep);
+        }
+
+        Dp[S] = 0;
+        Dp[i] = N/rep;
+
+        new.tree = F(lookup.samples(data, list(convert.table.to.idx(Dp)))[[1]])
+        leverage.vals = c(leverage.vals, as.matrix(dist.multiPhylo(list(to,
+                            new.tree)))[1,2])
+    }
+
+    leverage.vals
+}
+
+
diff --git a/R/zzz.R b/R/zzz.R
new file mode 100644
index 0000000..9391b00
--- /dev/null
+++ b/R/zzz.R
@@ -0,0 +1,5 @@
+.onUnload <- function(libpath)
+{
+    library.dynam.unload("distory", libpath)
+}
+
diff --git a/debian/changelog b/debian/changelog
deleted file mode 100644
index 419a463..0000000
--- a/debian/changelog
+++ /dev/null
@@ -1,5 +0,0 @@
-r-cran-distory (1.4.2-1) unstable; urgency=low
-
-  * Initial release (closes: #828857)
-
- -- Andreas Tille <tille at debian.org>  Tue, 28 Jun 2016 17:08:15 +0200
diff --git a/debian/compat b/debian/compat
deleted file mode 100644
index ec63514..0000000
--- a/debian/compat
+++ /dev/null
@@ -1 +0,0 @@
-9
diff --git a/debian/control b/debian/control
deleted file mode 100644
index 212a975..0000000
--- a/debian/control
+++ /dev/null
@@ -1,23 +0,0 @@
-Source: r-cran-distory
-Maintainer: Debian Med Packaging Team <debian-med-packaging at lists.alioth.debian.org>
-Uploaders: Andreas Tille <tille at debian.org>
-Section: gnu-r
-Priority: optional
-Build-Depends: debhelper (>= 9),
-               cdbs,
-               r-base-dev,
-               r-cran-ape
-Standards-Version: 3.9.8
-Vcs-Browser: https://anonscm.debian.org/viewvc/debian-med/trunk/packages/R/r-cran-distory/trunk/
-Vcs-Svn: svn://anonscm.debian.org/debian-med/trunk/packages/R/r-cran-distory/trunk/
-Homepage: https://cran.r-project.org/web/packages/distory
-
-Package: r-cran-distory
-Architecture: any
-Depends: ${shlibs:Depends},
-         ${misc:Depends},
-         ${R:Depends},
-         r-cran-ape
-Description: GNU R distance between phylogenetic histories
- This GNU R package enables calculation of geodesic distance between
- phylogenetic trees and associated functions.
diff --git a/debian/copyright b/debian/copyright
deleted file mode 100644
index 06636df..0000000
--- a/debian/copyright
+++ /dev/null
@@ -1,39 +0,0 @@
-Format: http://www.debian.org/doc/packaging-manuals/copyright-format/1.0/
-Upstream-Name: distory
-Upstream-Contact: Susan Holmes <susan at stat.stanford.edu>
-Source: https://cran.r-project.org/web/packages/distory/index.html
-
-Files: *
-Copyright: 2012-2016 John Chakerian and Susan Holmes
-License: BSD-3-clause
-
-Files: debian/*
-Copyright: 2016 Andreas Tille <tille at debian.org>
-License: BSD-3-clause
-
-License: BSD-3-clause
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions are
- met:
- .
-  1. Redistributions of source code must retain the above copyright
-     notice, this list of conditions and the following disclaimer.
-  2. Redistributions in binary form must reproduce the above copyright
-     notice, this list of conditions and the following disclaimer in
-     the documentation and/or other materials provided with the
-     distribution.
-  3. Neither the name of the copyright holder nor the names of its
-     contributors may be used to endorse or promote products derived
-     from this software without specific prior written permission.
- .
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
- IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
- TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
- PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
- TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
- PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
- LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
- NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/debian/rules b/debian/rules
deleted file mode 100755
index 2fbba2d..0000000
--- a/debian/rules
+++ /dev/null
@@ -1,3 +0,0 @@
-#!/usr/bin/make -f
-
-include /usr/share/R/debian/r-cran.mk
diff --git a/debian/source/format b/debian/source/format
deleted file mode 100644
index 163aaf8..0000000
--- a/debian/source/format
+++ /dev/null
@@ -1 +0,0 @@
-3.0 (quilt)
diff --git a/debian/watch b/debian/watch
deleted file mode 100644
index c2be9eb..0000000
--- a/debian/watch
+++ /dev/null
@@ -1,2 +0,0 @@
-version=3
-http://cran.r-project.org/src/contrib/distory_([-\d.]*)\.tar\.gz
diff --git a/man/bethe.tree.Rd b/man/bethe.tree.Rd
new file mode 100644
index 0000000..0c17261
--- /dev/null
+++ b/man/bethe.tree.Rd
@@ -0,0 +1,56 @@
+\name{bethe.tree}
+\Rdversion{1.1}
+\alias{bethe.tree}
+\title{
+    Bethe Tree
+}
+
+\description{
+    Generates a Bethe tree with given tips, inner edge lengths, and outgroup.
+}
+
+\usage{
+    bethe.tree(tips, level.lengths = NULL, outgroup="O", outgroup.dist=1)
+}
+
+\arguments{
+  \item{tips}{
+      A list of tip names as a character vector. Should be a power of 2. All
+      tip names must be distinct.
+}
+ \item{level.lengths}{
+     Edge lengths for each level, counted from the bottom up. NULL means a
+     default of 1. If the vector isn't long enough, the last value will be
+     repeated as necessary.
+}
+ \item{outgroup}{
+     The tip label for the outgroup.
+}
+ \item{outgroup.dist}{
+     The distance of the outgroup from the root. 
+}
+}
+
+\details{
+    Generates a Bethe tree with specified internal edge lengths.
+}
+\value{
+    A class of type \code{phylo} representing the tree.
+}
+\author{
+    John Chakerian <chakj at stanford.edu>
+}
+\references{
+    Chakerian, J. and Holmes, S. P. Computational Tools for Evaluating
+    Phylogenetic and Heirarchical Clustering Trees.	arXiv:1006.1015v1.
+}
+
+\seealso{
+    \code{\link[distory]{dist.multiPhylo}}
+}
+
+\examples{
+
+plot(bethe.tree( as.character(1:16), 1:4, "17", 14)) 
+
+}
diff --git a/man/bin.multiPhylo.Rd b/man/bin.multiPhylo.Rd
new file mode 100644
index 0000000..d0926b1
--- /dev/null
+++ b/man/bin.multiPhylo.Rd
@@ -0,0 +1,60 @@
+\name{bin.multiPhylo}
+\Rdversion{1.1}
+\alias{bin.multiPhylo}
+\title{
+    Bin Trees
+}
+
+\description{
+    Bins trees according to branching topology. 
+}
+
+\usage{
+    bin.multiPhylo(treelist)
+}
+
+\arguments{
+  \item{treelist}{
+      A list of trees that can be passed to dist.phylo (see the help for
+      dist.phylo for acceptable formats)
+}
+}
+
+\details{
+   Bins trees according to branching topology. Two trees are considered to have
+   the same topology if the same set of partitions of tips are produced by
+   the edges, which corresponds to the same branching up to rearrangement
+   of tips.
+}
+\value{
+    Returns a numeric vector of bin ids. Bin ids are assigned in order of the
+    first tree in that bin, that is, the first k unique trees in the list
+    passed will be assigned bins 1..k in order of appearance. 
+}
+\author{
+    John Chakerian <chakj at stanford.edu>
+}
+\references{
+    Chakerian, J. and Holmes, S. P. Computational Tools for Evaluating
+    Phylogenetic and Heirarchical Clustering Trees.	arXiv:1006.1015v1.
+
+}
+
+\seealso{
+    \code{\link[distory]{dist.multiPhylo}}
+}
+
+\examples{
+
+data(woodmouse)
+otree <- root(fastme.ols(dist.dna(woodmouse)), "No305", resolve.root=TRUE)
+breps = 500
+
+trees <- boot.phylo(otree, woodmouse, B=breps, function(x)
+        root(fastme.ols(dist.dna(x)), "No305", resolve.root=TRUE),trees=TRUE)
+
+combined.trees <- c(list(otree), trees$trees)
+
+bin.multiPhylo(combined.trees)
+
+}
diff --git a/man/dist.multiPhylo.Rd b/man/dist.multiPhylo.Rd
new file mode 100644
index 0000000..662360b
--- /dev/null
+++ b/man/dist.multiPhylo.Rd
@@ -0,0 +1,122 @@
+\name{dist.multiPhylo}
+\Rdversion{1.1}
+\alias{dist.multiPhylo}
+\title{
+    Geodesic Distance Between Phylogenetic Trees
+}
+
+\description{
+    Computes the geodesic distance of a list of phylogenetic trees using a 
+    polynomial algorithm. 
+}
+
+\usage{
+    dist.multiPhylo(x, method="geodesic", force.multi2di = FALSE, outgroup = NULL, 
+    convert.multifurcating = FALSE, use.random.resolution = FALSE, scale =
+    NULL, verbose = FALSE)
+}
+
+\arguments{
+  \item{x}{
+      A list of ape trees (class 'phylo'). The list does not have to be of 
+      class 'multiPhylo'. The function will also accept a list of strings of 
+      trees in Newick format, or a single string with trees in Newick format
+      separated by semicolons. All the trees must have the same tip labels.
+}
+  \item{method}{
+      Determines which distance method is used. Options are 'geodesic' for the
+      tree space geodesic distance, or 'edgeset' for the number of edges
+      (defined by splits of tips) that are different.
+}
+  \item{force.multi2di}{
+      Force conversion of every tree to strict bifurcating through the ape 
+      function 'multi2di', using the use.random.resolution as its parameter.
+      This option should not be used in conjunction with specification of an
+      outgroup.
+}
+  \item{outgroup}{
+      Specifies an outgroup to root each tree with respect to. This calls the
+      ape function 'root' on every tree in the list.
+}
+  \item{convert.multifurcating}{
+      Setting this option will check every tree for multifurcations using the 
+      ape function 'is.binary.tree' - if it returns FALSE, the ape function
+      'multi2di' will be called on it. Note that this does not ensure a tree
+      is strictly binary, since ape considers an unrooted tree binary even if
+      the root node is trifurcating. This option can be used in conjunction
+      with specification of an outgroup.
+}
+
+  \item{use.random.resolution}{
+      Specifies the parameter to 'multi2di' if needed. 
+}
+  \item{scale}{
+      Specifies a scale to make all trees unformly scaled (that is, the sum of
+      all edges will be uniform)scale to make all trees unformly scaled (that 
+      is, the sum of all edge lengths will be uniform). The parameter can 
+      either be a tree of class \code{phylo} or a numeric value for the sum of 
+      all edge lengths.
+}
+  \item{verbose}{
+      Turns on incremental status updates and more warnings. Helpful for large 
+      computations.
+}
+
+}
+\details{
+    This function computes the geodesic distance according to Billera et. al.
+    using an algorithm based off of the polynomial time algorithm of Owen
+    and Provan. Since it corresponds to a formal definition of tree-space as
+    a space of strictly binary trees, no mulifurcations are allowed, including
+    on the root node. In addition, negative and 0-lengthed edges are clamped to
+    a very small value (DBL_MIN) for technical reasons. 
+
+    The Newick parser supports only a subset of the Newick format. In
+    particular, it does not at the moment allow for internal node labels, only
+    weights. Weights will be automatically set to 1 if not specified. It may be
+    necessary to clean data in ape to make the trees conform to this.
+}
+\value{
+    Returns a distance matrix of class 'dist' representing the pairwise
+    geodesic distances between all input trees. Keep in mind this distance
+    matrix is not Euclidean. N/A values are provided in the case of an error in
+    determining the distance.
+}
+\references{
+    Chakerian, J. and Holmes, S. P. Computational Tools for Evaluating
+    Phylogenetic and Heirarchical Clustering Trees.	arXiv:1006.1015v1.
+    
+    Billera, L. J., Holmes, S. P., and Vogtmann, K. (2001) Geometry of the space
+    of phylogenetic trees. _Adv. Appl. Math_, *27*, 733-767.
+
+    Megan Owen, J. Scott Provan, "A Fast Algorithm for Computing Geodesic
+    Distances in Tree Space," IEEE/ACM Transactions on Computational Biology
+    and Bioinformatics, 14 Jan. 2010.
+}
+\author{
+    John Chakerian <chakj at stanford.edu>
+}
+
+\seealso{
+    \code{\link[ape]{dist.dna}}
+    \code{\link[ape]{boot.phylo}}
+    \code{\link{cmdscale}} 
+}
+
+\examples{
+
+data(woodmouse)
+otree <- root(nj(dist.dna(woodmouse)), "No305", resolve.root=TRUE)
+breps = 250
+
+trees <- boot.phylo(otree, woodmouse, B=breps, function(x)
+        root(nj(dist.dna(x)), "No305", resolve.root=TRUE),trees=TRUE)
+
+combined.trees <- c(list(otree), trees$trees)
+tree.dists <- dist.multiPhylo(combined.trees)
+
+mdres <- cmdscale(tree.dists, k=breps, add=TRUE)
+plot(mdres$points[,1], mdres$points[,2], col = c("red", rep("black", breps)))
+text(mdres$points[,1], mdres$points[,2], ,labels=1:(breps+1), cex=0.7, adj=c(0,2))
+
+}
diff --git a/man/distory-package.Rd b/man/distory-package.Rd
new file mode 100644
index 0000000..3c4f234
--- /dev/null
+++ b/man/distory-package.Rd
@@ -0,0 +1,51 @@
+\name{distory-package}
+\Rdversion{1.1}
+\alias{distory-package}
+\alias{distory}
+\docType{package}
+\title{
+    Distance Between Phylogenetic Histories
+}
+\description{
+    The 'distory' package provides functions for computing geodesic distances
+    between phylogenetic trees, as well as functions which use this distance.
+    Methods for computing Gromov delta-hyperbolicity, Markov Chain Monte Carlo
+    routines in tree space, and per-position leverage for DNA sequences are
+    included.
+}
+\details{
+    A description of the algorithm used for the distance computation can be found
+    in help(dist.multiPhylo).
+}
+\author{
+    John Chakerian <chakj at stanford.edu> and Susan Holmes <susan at stat.stanford.edu>
+}
+\references{
+    Chakerian, J. and Holmes, S. P. Computational Tools for Evaluating
+    Phylogenetic and Heirarchical Clustering Trees.	arXiv:1006.1015v1. 
+
+    Billera, L. J., Holmes, S. P., and Vogtmann, K. (2001) Geometry of the space
+    of phylogenetic trees. _Adv. Appl. Math_, *27*, 733-767.
+
+    Megan Owen, J. Scott Provan, "A Fast Algorithm for Computing Geodesic
+    Distances in Tree Space," IEEE/ACM Transactions on Computational Biology
+    and Bioinformatics, 14 Jan. 2010.
+}
+\keyword{ package }
+\examples{
+
+data(woodmouse)
+otree <- root(nj(dist.dna(woodmouse)), "No305", resolve.root=TRUE)
+breps = 250
+
+trees <- boot.phylo(otree, woodmouse, B=breps, function(x)
+        root(nj(dist.dna(x)), "No305", resolve.root=TRUE),trees=TRUE)
+
+combined.trees <- c(list(otree), trees$trees)
+tree.dists <- dist.multiPhylo(combined.trees)
+
+mdres <- cmdscale(tree.dists, k=breps, add=TRUE)
+plot(mdres$points[,1], mdres$points[,2], col = c("red", rep("black", breps)))
+text(mdres$points[,1], mdres$points[,2], labels=1:(breps+1), cex=0.7, adj=c(0,2))
+
+}
diff --git a/man/gromov.hyperbolicity.Rd b/man/gromov.hyperbolicity.Rd
new file mode 100644
index 0000000..67600f6
--- /dev/null
+++ b/man/gromov.hyperbolicity.Rd
@@ -0,0 +1,75 @@
+\name{gromov.hyperbolicity}
+\Rdversion{1.1}
+\alias{gromov.hyperbolicity}
+\title{
+    Gromov Hyperbolicity Constant
+}
+
+\description{
+    Computes the Gromov Hyperbolicity Constant of a distance matrix.
+}
+
+\usage{
+    gromov.hyperbolicity(d, deltas = FALSE, scale = NA)
+}
+
+\arguments{
+  \item{d}{
+      A distance matrix of type \code{dist} or \code{matrix}, or anything that
+      can be coerced into \code{dist} by \code{as.dist}. Must have at least 4
+      points.
+}
+  \item{deltas}{
+      A logical value specifying whether to return the vector of delta values.
+      Default is \code{FALSE}.
+}
+  \item{scale}{
+     Specifies a scaling method for each delta. Default is no scaling (NA or
+     "none"). Available methods are "max" which scales deltas by the max of the
+     sums computed, and "perimeter" for the largest perimeter of the four points.
+}
+}
+
+\details{
+    This computes a constant that represents the relaxation of a 4-point
+    condition for delta-hyperbolicity. See (Gromov 1987) for details.
+}
+
+\value{
+    The Gromov hyperbolicity constant of the given distance matrix.
+}
+
+\author{
+    John Chakerian <chakj at stanford.edu>
+}
+
+\seealso{
+    \code{\link[distory]{dist.multiPhylo}}
+}
+
+\references{
+    M. Gromov. Hyperbolic groups. In Essays in Group Theory, pages 73-263.
+    Springer, New York, 1987.
+
+    Chakerian, J. and Holmes, S. P. Computational Tools for Evaluating
+    Phylogenetic and Heirarchical Clustering Trees.	arXiv:1006.1015v1. 
+}
+
+\examples{
+
+# scale final delta by max distance
+points <- cbind(runif(100), runif(100))
+d <- dist(points)
+gromov.hyperbolicity(d)/max(d)
+
+# scale each delta by max distance for the 4 points
+points <- cbind(runif(100), runif(100))
+d <- dist(points)
+gromov.hyperbolicity(d, scale="max")
+
+# scale each delta by the max perimeter for the 4 points
+points <- cbind(runif(100), runif(100))
+d <- dist(points)
+gromov.hyperbolicity(d, scale="max")
+
+}
diff --git a/man/mcmc.target.seq.Rd b/man/mcmc.target.seq.Rd
new file mode 100644
index 0000000..939540f
--- /dev/null
+++ b/man/mcmc.target.seq.Rd
@@ -0,0 +1,136 @@
+\name{mcmc.target.seq}
+\Rdversion{1.1}
+\alias{mcmc.target.seq}
+\alias{boot.samples.idxs}
+\alias{lookup.samples}
+\alias{convert.table.to.idx}
+
+\title{
+    Find MCMC Target Sequence
+}
+
+\description{
+    \code{mcmc.target.seq} uses MCMC to find a configuration of DNA positions
+    to get as close as possible to a given tree.
+
+    \code{boot.samples.idxs} bootstraps over indices into a DNA matrix.
+
+    \code{lookup.samples} goes from an index representation of a configuration
+    of DNA to the actual DNAbin format.
+
+    \code{convert.table.to.idx} converts a table of counts for positions 1..n
+    into a list of indices corresponding to positions (i.e. goes from the
+    tabled form to a vector whose tabling matches the input).
+}
+
+\usage{
+    mcmc.target.seq(data, x, F, n) 
+
+    boot.samples.idxs(data, B = 100, block = 1)
+
+    lookup.samples(data, idxs)
+
+    convert.table.to.idx(T)
+}
+
+\arguments{
+  \item{data}{
+      A DNA matrix in DNAbin format.
+}
+  \item{x}{
+      A tree of class 'phylo' to estimate.
+}
+  \item{F}{
+      A tree estimation function, accepting a DNA matrix in DNAbin format and
+      returning a tree of class 'phylo.'
+}
+  \item{n}{
+      The number of MCMC iterations to perform
+}
+ \item{B}{
+      The number of bootstrap replicates.
+}
+ \item{block}{
+      The block size to use during bootstrapping.
+}
+ \item{idxs}{
+     A list of numeric vectors of indices to use for lookup.
+}
+ \item{T}{
+     A table or table-like vector to convert.
+}
+}
+
+\details{
+    \code{mcmc.target.seq} performs an MCMC with simulated annealing to locate
+    a configuration of DNA positions from the original matrix that gets as
+    close as possible to a target tree. Propositions for the MCMC replacing
+    one character with another uniformly at random. 
+
+    The remaining functions are intended to be used as support functions.
+}
+\value{
+    \code{mcmc.target.seq} returns a list of 4 elements: a numeric vector of
+    counts of each position in the original matrix, the best estimated
+    tree, the best distance from the estimated tree to the target tree, and
+    a numeric vector of the distances for every iteration of the
+    simulation. 
+
+    \code{boot.samples.idxs} returns a numeric vector representing the
+    bootstrapped idices.
+
+    \code{lookup.samples} returns a list of objects of class DNAbin
+    corresponding to the DNA sequences generated from indices into the original
+    DNA matrix.
+
+    \code{convert.table.to.idx} returns a numeric vector of indices based on
+    the table counts.
+}
+\author{
+    John Chakerian <chakj at stanford.edu>
+}
+\references{
+    Chakerian, J. and Holmes, S. P. Computational Tools for Evaluating
+    Phylogenetic and Heirarchical Clustering Trees.	arXiv:1006.1015v1.
+}
+
+\seealso{
+    \code{\link[distory]{dist.multiPhylo}}
+    \code{\link[distory]{orthant.boundary.tree}}
+}
+
+\examples{
+
+data(woodmouse)
+otree <- root(fastme.ols(dist.dna(woodmouse)), "No305", resolve.root=TRUE)
+breps = 200
+
+trees <- boot.phylo(otree, woodmouse, B=breps, function(x)
+        root(fastme.ols(dist.dna(x)), "No305", resolve.root=TRUE),trees=TRUE)
+
+combined.trees <- c(list(otree), trees$trees)
+
+binning <- bin.multiPhylo(combined.trees)
+
+tree.a <- combined.trees[[match(1, binning)]]
+i = 2
+max.bin = max(binning)
+tree.b <- combined.trees[[match(2, binning)]]
+
+while(length(distinct.edges(tree.a,tree.b)) > 1 && i < max.bin)
+{
+    i = i + 1
+    tree.b = combined.trees[[match(i, binning)]]
+}
+
+bdy.tree <- orthant.boundary.tree(tree.a, tree.b)
+
+f.est <- function(x) root(nj(dist.dna(x)), "No305", resolve.root=TRUE)
+
+res <- mcmc.target.seq(woodmouse, bdy.tree, f.est, 1000)
+
+par(mfrow=c(2,1))
+plot(res$tree)
+plot(res$vals)
+
+}
diff --git a/man/orthant.boundary.tree.Rd b/man/orthant.boundary.tree.Rd
new file mode 100644
index 0000000..646b2b7
--- /dev/null
+++ b/man/orthant.boundary.tree.Rd
@@ -0,0 +1,73 @@
+\name{orthant.boundary.tree}
+\Rdversion{1.1}
+\alias{orthant.boundary.tree}
+\title{
+    Orthant Boundary Tree
+}
+
+\description{
+    Produces a degenerate tree on the boundary between trees that differ by one split.
+}
+
+\usage{
+    orthant.boundary.tree(x,y)
+}
+
+\arguments{
+  \item{x}{
+      The tree in the first orthant.
+}
+  \item{y}{
+      The tree in the second orthant.
+}
+}
+
+\details{
+    The tree found is the tree on the boundary between the two orthants such
+    that it is on the straight line connecting the two trees when one
+    orthant is thought of as being the (-,+) quadrant and the second
+    orthant as being the (+,+) quadrant, where the (0,y) line is the particular
+    boundary in question.
+}
+\value{
+    Returns an object of class 'phylo' representing the boundary tree.
+}
+\author{
+    John Chakerian <chakj at stanford.edu>
+}
+\references{
+    Chakerian, J. and Holmes, S. P. Computational Tools for Evaluating
+    Phylogenetic and Heirarchical Clustering Trees.	arXiv:1006.1015v1.
+}
+
+\seealso{
+    \code{\link[distory]{mcmc.target.seq}}
+}
+
+\examples{
+
+data(woodmouse)
+otree <- root(fastme.ols(dist.dna(woodmouse)), "No305", resolve.root=TRUE)
+breps = 200
+
+trees <- boot.phylo(otree, woodmouse, B=breps, function(x)
+        root(fastme.ols(dist.dna(x)), "No305", resolve.root=TRUE),trees=TRUE)
+
+combined.trees <- c(list(otree), trees$trees)
+
+binning <- bin.multiPhylo(combined.trees)
+
+tree.a <- combined.trees[[match(1, binning)]]
+i = 2
+max.bin = max(binning)
+tree.b <- combined.trees[[match(2, binning)]]
+
+while(length(distinct.edges(tree.a,tree.b)) > 1 && i < max.bin)
+{
+    i = i + 1
+    tree.b = combined.trees[[match(i, binning)]]
+}
+
+plot(orthant.boundary.tree(tree.a, tree.b))
+
+}
diff --git a/man/phylo.diff.Rd b/man/phylo.diff.Rd
new file mode 100644
index 0000000..ea07c69
--- /dev/null
+++ b/man/phylo.diff.Rd
@@ -0,0 +1,107 @@
+\name{phylo.diff}
+\Rdversion{1.1}
+\alias{phylo.diff}
+\alias{distinct.edges}
+\alias{edge.from.split}
+\alias{get.bipartition}
+\alias{partition.leaves}
+
+\title{
+    Differences Between Phylogenetic Trees
+}
+
+\description{
+    A family of functions for determining and plotting the differences between
+    two trees.
+
+    \code{phylo.diff} plots two trees side by side, highlighting edges unique
+    to each tree in red.
+
+    \code{distinct.edges} finds the edges present in the first argument not in
+    the second.
+
+    \code{edge.from.split} locates the edge id from a given split.
+
+    \code{get.bipartition} gets the bipartition of tips formed by a single edge.
+
+    \code{partition.leaves} returns the set of all bipartitions from all edges.
+}
+
+\usage{
+    phylo.diff(x, y, ...)
+
+    distinct.edges(x, y)
+
+    edge.from.split(x, split)
+
+    get.bipartition(x, e)
+
+    partition.leaves(x)
+
+}
+
+\arguments{
+  \item{x}{
+      The first (or only) tree.
+}
+  \item{y}{
+      The second tree, for the functions that accept two trees.
+}
+  \item{split}{
+      A list of bipartitions, probably from \code{partition.leaves}.
+}
+  \item{e}{
+      An edge for a particular tree, given as an id.
+}
+ \item{...}{
+      Additional arguments to pass to the \code{plot.phylo} function.
+}
+}
+
+\details{
+    \code{phylo.diff} uses the ape tree plotting function. The other functions
+    are mostly meant as support functions.
+}
+\value{
+    \code{phylo.diff} returns invisible.
+
+    \code{distinct.edges} returns a numeric vector of edge ids for the first
+    tree.
+
+    \code{edge.from.split} returns an edge id for a particular tree
+    corresponding to a given bipartition and NA if none such edge exists.
+
+    \code{get.bipartition} returns a character vector of the tips below that
+    edge in the given tree.
+
+    \code{partition.leaves} returns a list of partitions (themselves character
+    vectors) of the given tree.
+}
+\author{
+    John Chakerian <chakj at stanford.edu>
+}
+\references{
+    Chakerian, J. and Holmes, S. P. Computational Tools for Evaluating
+    Phylogenetic and Heirarchical Clustering Trees.	arXiv:1006.1015v1.
+}
+
+\seealso{
+    \code{\link[distory]{dist.multiPhylo}}
+}
+
+\examples{
+
+data(woodmouse)
+otree <- root(fastme.ols(dist.dna(woodmouse)), "No305", resolve.root=TRUE)
+breps = 10
+
+trees <- boot.phylo(otree, woodmouse, B=breps, function(x)
+        root(fastme.ols(dist.dna(x)), "No305", resolve.root=TRUE),trees=TRUE)
+
+combined.trees <- c(list(otree), trees$trees)
+
+binning <- bin.multiPhylo(combined.trees)
+
+phylo.diff(combined.trees[[match(1, binning)]], combined.trees[[match(2, binning)]])
+
+}
diff --git a/man/position.leverage.Rd b/man/position.leverage.Rd
new file mode 100644
index 0000000..abe7233
--- /dev/null
+++ b/man/position.leverage.Rd
@@ -0,0 +1,71 @@
+\name{position.leverage}
+\Rdversion{1.1}
+\alias{position.leverage}
+\title{
+    Position Leverage
+}
+
+\description{
+    Provides a rough heuristic for determining the degree to which each
+    position in the DNA matrix affects the resulting tree.
+}
+
+\usage{
+    position.leverage(data, F, to = NULL, rep = 50, by = 1)
+}
+
+\arguments{
+  \item{data}{
+      A DNA matrix in \code{DNAbin} format.
+}
+  \item{F}{
+      A tree estimation function, accepting a DNA matrix of class \code{DNAbin}
+      and returning a tree of class \code{phylo}
+}
+  \item{to}{
+      The tree with which distances are measured in respect to, or \code{NULL}
+      to indicate the tree estimated by \code{F} for the starting DNA matrix.
+}
+  \item{rep}{
+      The number of times to replicate the position in question.
+}
+  \item{by}{
+      The function will perform the calculation on every \code{by}-th position
+      (that is, on \code{ seq(1,N,by) } )
+}
+}
+
+\details{
+    This function takes a DNA matrix and, for every \code{by}-th position,
+    replicates that position \code{rep} times, randomly removing
+    \code{rep} other positions to keep all sequences the same length
+    other positions to keep all sequences the same length. For each new DNA
+    matrix created in this way, \code{F} is used to estimate the corresponding
+    tree, and the distance to tree \code{to} is computed and stored. This
+    distance can be thought of as somewhat analogous to the leverage of that
+    position.
+}
+\value{
+    Returns a numeric vector of distances from tree \code{to} for each position
+    sampled.
+}
+\author{
+    John Chakerian <chakj at stanford.edu>
+}
+\references{
+    Chakerian, J. and Holmes, S. P. Computational Tools for Evaluating
+    Phylogenetic and Heirarchical Clustering Trees.	arXiv:1006.1015v1. 
+}
+
+\seealso{
+    \code{\link[distory]{dist.multiPhylo}}
+}
+
+\examples{
+
+data(woodmouse)
+f.est <- function(x) root(nj(dist.dna(x)), "No305", resolve.root=TRUE)
+position.leverage(woodmouse, f.est, by=10)
+
+}
+
diff --git a/src/newick.cpp b/src/newick.cpp
new file mode 100644
index 0000000..f90fa11
--- /dev/null
+++ b/src/newick.cpp
@@ -0,0 +1,318 @@
+/* 
+ * Polynomial Time Distance Algorithm for Binary Rooted Phylogenetic Trees
+ *
+ * Note: Algorithm based on presentation by Scott Provan / Megan Owen,
+ *       "Computing Geodesic Distances in Tree Space in Polynomial Time"
+ * 
+ * John Chakerian
+ * chakj at stanford.edu
+ * 
+ * Last edited: Apr 22 2010
+ *
+ */
+
+#include <iostream>
+#include <algorithm>
+#include <cstdlib>
+#include <cctype>
+#include <set>
+#include <string>
+#include <vector>
+#include <stack>
+
+#include <R.h>
+
+#include "newick.h"
+
+// Optionally parse a floating point value after a ':' to indicate edge weight.
+// Sets the weight to 1 if no ':' is found.
+
+double ParseWeight(const string &str, unsigned int pos, unsigned int *off)
+{
+    double w = 1;
+    if(str[pos] == ':')
+    {
+        char* e;
+
+        string s = str.substr(pos+1);
+        const char* cs = s.c_str();
+
+        w = strtod(cs, &e);
+        if(cs == e)
+        {
+            w = 1;
+        }
+        else
+        {
+            pos += (e - cs)+1;
+        }
+    }
+
+    if(off)
+        *off = pos;
+
+    return w;
+}
+
+map<string,unsigned int> AssignLeafLabels(const string &str)
+{
+    unsigned int ctr = 0;
+    map<string, unsigned int> ret;
+
+    bool rec = false;
+    string t = "";
+    for(unsigned int i = 0; i < str.size(); i++)
+    {
+        if(str[i] == ' ') continue;
+
+        if(str[i] == '(' || str[i] == ',')
+        {
+            rec = true;
+            continue;
+        }
+
+        if(rec)
+        {
+            if(isalpha(str[i]) || isdigit(str[i]) || str[i] == '_' || str[i] == '-')
+                t += str[i];
+            else
+            {
+                ret[t] = ctr;
+                t = "";
+                ctr++;
+                rec = false;
+            }
+        }
+    }
+    
+    return ret;
+}
+
+// We'll use a simple state machine to parse over the trees. Technically this
+// isn't the best solution - a PDA would be more appropriate, however we're not
+// interested in a full parse, only the partitions caused by each edge. Because
+// of this, we can avoid recursion or any sort of tree data structure and just
+// go right to the partitions. 
+
+PhyEdgeSet NewickParse(const string &str, map<string,unsigned int> &name_to_pos)
+{
+    unsigned int pos = 0; // position in string
+    unsigned int ctr = 0; // nesting level
+    unsigned int id_ctr = 0; // for assigning each a unique id
+    string errstr;
+
+    vector< set<string> > in_leaves(30);
+    stack<int> commas; // for ensuring trees are bifurcating
+
+    vector<PhyEdge> edges;
+
+    if(str.length() == 0)
+    {
+        return vector<PhyEdge>();
+    }
+
+    // move past leading whitespace
+    while(str[pos] == ' ' || str[pos] == '\t')
+        pos++;
+
+    // we can bail out early here if we have an empty string or a ; character
+    if(str[pos] == ';')
+    {
+        return vector<PhyEdge>();
+    }
+
+    // if the last non-whitespace character isn't a ;, bail out
+    size_t tpos = str.length()-1;
+    while(tpos > 0 && isspace(str[tpos])) 
+        tpos--;
+
+    if(tpos == 0) // empty string
+    {
+        return vector<PhyEdge>();
+    }
+
+    if(tpos > 0 && str[tpos] != ';')
+    {
+        errstr = "Tree not terminated with ';'";
+        goto error;
+    }
+
+    goto start_state;
+
+start_state:
+    if(str[pos] == ';')
+        goto finish;
+    else if(str[pos] == '(')
+        goto new_nesting;
+    else
+    {
+        errstr = "Parse error at first character. A tree must have at least 2 taxa.";
+        goto error;
+    }
+
+new_nesting:
+    ctr++;
+    in_leaves.push_back(set<string>());
+    commas.push(0);
+
+    pos++; // move to first character of new nesting
+
+    // move past whitespace
+    while(str[pos] == ' ' || str[pos] == '\t')
+        pos++;
+
+    if(str[pos] == '(')
+        goto new_nesting; // start the process over again
+    else if(isalpha(str[pos]) || isdigit(str[pos]) || str[pos] == '_' || str[pos] == '-')
+        goto leaf_entry;
+    else if(str[pos] == ':')
+    {
+        errstr = "Leaf nodes must have string labels.";
+        goto error;
+    }
+    else if(str[pos] == ')')
+    {
+        errstr = "Empty blocks of form () are not allowed.";
+        goto error;
+    }
+    else
+    {
+        errstr = "Parse error after '('";
+        goto error;
+    }
+
+leaf_entry:
+    // move past whitespace
+    while(str[pos] == ' ' || str[pos] == '\t')
+        pos++;
+
+    // add the edge to the leaf
+    {
+        // get the leaf name
+        unsigned int spos = pos;
+        while(isalpha(str[pos]) || isdigit(str[pos]) || str[pos] == '_' || str[pos] == '-')
+        {
+            pos++;
+        }
+
+        string leaflabel = str.substr(spos,pos-spos);
+
+        PhyEdge e(name_to_pos.size());
+        e.id = id_ctr++;
+        double w = ParseWeight(str,pos,&pos);
+        e.weight = w;
+        e.split[ name_to_pos[leaflabel] ] = false;
+
+        edges.push_back(e);
+
+        for(unsigned int i = 0; i < in_leaves.size(); i++)
+        {
+            in_leaves[i].insert(leaflabel);
+        }
+    }
+
+    // move past extra whitespace
+    while(str[pos] == ' ' || str[pos] == '\t')
+        pos++;
+
+    // decide what to do next
+    if(str[pos] == ',')
+    {
+        commas.top()++;
+        if(commas.top() > 1)
+        {
+            errstr = "More than one comma detected in a nesting.";
+            goto error;
+        }
+        pos++;
+        // move past whitespace
+        while(str[pos] == ' ' || str[pos] == '\t')
+            pos++;
+
+        if(str[pos] == '(')
+            goto new_nesting;
+        else if(isdigit(str[pos]) || isalpha(str[pos]) || str[pos] == '_' || str[pos] == '-')
+            goto leaf_entry;
+        else
+        {
+            errstr = "Couldn't figure out what to do when moving past a ','.";
+            goto error;
+        }
+    }
+    else if(str[pos] == ')')
+        goto close_nesting;
+    else
+    {
+        errstr = "Couldn't figure out what to do after a first leaf entry ";
+        goto error;
+    }
+
+close_nesting:
+    if(str[pos] != ')')
+    {
+        errstr = "Close-nesting expected but no end-bracket found.";
+        goto error;
+    }
+    pos++;
+    if(str[pos] == ';')
+        goto finish;
+
+    // add the edge
+    {
+        PhyEdge e(name_to_pos.size());
+        e.id = id_ctr++;
+        e.weight = ParseWeight(str,pos,&pos);
+        
+        for(set<string>::iterator it = in_leaves[in_leaves.size()-1].begin(); it != in_leaves[in_leaves.size()-1].end(); ++it)
+            e.split[ name_to_pos[*it] ] = false;
+
+        edges.push_back(e);
+    }
+    in_leaves.pop_back();
+    commas.pop();
+
+    if(str[pos] == ';')
+        goto finish;
+
+    // decide what to do next
+    if(str[pos] == ',')
+    {
+        commas.top()++;
+        if(commas.top() > 1)
+        {
+            errstr = "More than one comma detected in a nesting. Trees MUST be binary.";
+            goto error;
+        }
+        pos++;
+        // move past whitespace
+        while(str[pos] == ' ' || str[pos] == '\t')
+            pos++;
+
+        if(str[pos] == '(')
+            goto new_nesting;
+        else if(isdigit(str[pos]) || isalpha(str[pos]) || str[pos] == '_' || str[pos] == '-')
+            goto leaf_entry;
+        else
+        {
+            errstr = "Couldn't figure out what to do when moving past a ','.";
+            goto error;
+        }
+    }
+    else if(str[pos] == ')')
+        goto close_nesting;
+    else
+    {
+        errstr = "Couldn't figure out what to do after a second leaf entry";
+        goto error;
+    }
+
+error:
+    if(errstr == "") errstr = "Parser ran off the edge.";
+    error("An error was encountered in parsing near position %d: %s\n", pos, errstr.c_str());
+
+    return vector<PhyEdge>();
+    
+finish:
+    return edges;
+}
+
diff --git a/src/newick.h b/src/newick.h
new file mode 100644
index 0000000..09705d8
--- /dev/null
+++ b/src/newick.h
@@ -0,0 +1,19 @@
+#ifndef _NEWICK_H_
+#define _NEWICK_H_
+
+#include <vector>
+#include <string>
+
+#include "phylo.h"
+
+using namespace std;
+
+/* Parse a single tree in Newick format */
+PhyEdgeSet NewickParse(const string &str, map<string,unsigned int> &name_to_pos);
+
+/* Use a single tree to assign each leaf label an integer ID to be used for all
+ * future parsing */
+map<string,unsigned int> AssignLeafLabels(const string &str); 
+
+#endif /* _NEWICK_H_ */
+
diff --git a/src/phydist_r.cpp b/src/phydist_r.cpp
new file mode 100644
index 0000000..1cc5871
--- /dev/null
+++ b/src/phydist_r.cpp
@@ -0,0 +1,383 @@
+/* 
+ * Polynomial Time Distance Algorithm for Binary Rooted Phylogenetic Trees
+ *
+ * Note: Algorithm based on presentation by Scott Provan / Megan Owen,
+ *       "Computing Geodesic Distances in Tree Space in Polynomial Time"
+ * 
+ * John Chakerian
+ * chakj at stanford.edu
+ *
+ * Last edited: Feb 28 2010
+ *
+ */
+
+#include <algorithm>
+#include <vector>
+#include <fstream>
+#include <iostream>
+
+#include "treedist.h"
+
+#include <R.h>
+#include <Rinternals.h>
+
+using namespace std;
+
+int compute_phylo_distance_matrix(vector<string> trees_in, bool verbose, double *m);
+void build_tree_list(std::vector<string> &trees_in, std::vector<PhyEdgeSet> &trees, bool verbose);
+double gromov_graycode(double *m, size_t n, double* deltas, int scale);
+
+inline bool edge_less_than(const PhyEdge &a, const PhyEdge &b)
+{
+    for(unsigned int i = 0; i < a.split.size(); i++)
+    {
+        if(a.split[i] < b.split[i])
+            return true;
+        if(b.split[i] < a.split[i])
+            return false;
+    }
+    // if we're here, they're equal, so a is NOT less than b.
+    return false;
+}
+
+inline int edgeset_difference(const PhyEdgeSet &a, const PhyEdgeSet &b)
+{
+    int sim = 0; 
+    for(unsigned int i = 0; i < a.size(); i++)
+    {
+        for(unsigned int j = 0; j < a.size(); j++)
+        {
+            if(a[i] == b[j])
+            {
+                sim++;
+                break;
+            }
+        }
+    }
+
+    return a.size() - sim;
+}
+
+extern "C"
+{
+    // returns a distance matrix of doubles
+    SEXP phycpp_compute_tree_distance_set(SEXP trees, SEXP verbose)
+    {
+        SEXP distmatrix;
+
+        bool be_verbose = asLogical(verbose);
+
+        // get number of trees
+        int len = length(trees);
+
+        vector<string> treevec(len);
+
+        // convert 'trees' to a vector of tree strings
+        for(int i = 0; i < len; i++)
+            treevec[i] = CHAR(STRING_ELT(VECTOR_ELT(trees,i),0));
+
+        PROTECT(distmatrix = allocMatrix(REALSXP,len, len));
+
+        compute_phylo_distance_matrix(treevec, be_verbose, REAL(distmatrix));
+
+        // replace -1 values with R N/A values
+        for(int i = 0; i < len*len; i++)
+            if(REAL(distmatrix)[i] == -1)
+                REAL(distmatrix)[i] = R_NaReal;
+
+        UNPROTECT(1);
+
+        return distmatrix;
+    }
+
+    SEXP multiset_diff_integer(SEXP set1, SEXP set2)
+    {
+        unsigned len1 = length(set1);
+        int *s1 = INTEGER(set1);
+        unsigned len2 = length(set2);
+        int *s2 = INTEGER(set2);
+
+        SEXP newset;
+        PROTECT(newset = allocVector(INTSXP, len1));
+        int *a = INTEGER(newset);
+
+        unsigned p=0;
+        for(unsigned i = 0; i < len1; i++)
+        {
+            bool keep = true;
+            for(unsigned j = 0; j < len2; j++)
+            {
+                if(s1[i] == s2[j])
+                {
+                    keep = false;
+                    break;
+                }
+            }
+
+            if(keep)
+                a[p++] = s1[i];
+        }
+
+        for(unsigned i = p; i < len1; i++)
+        {
+            a[i] = NA_INTEGER;
+        }
+
+        
+        UNPROTECT(1);
+        return newset;
+    }
+
+    // returns edge differences
+    SEXP phycpp_bin_trees(SEXP treelist)
+    {
+        // get number of trees
+        int len = length(treelist);
+
+        vector<string> treevec(len);
+
+        // convert 'trees' to a vector of tree strings
+        for(int i = 0; i < len; i++)
+            treevec[i] = CHAR(STRING_ELT(VECTOR_ELT(treelist,i),0));
+
+        std::vector<PhyEdgeSet> trees;
+        build_tree_list(treevec, trees, false);
+
+        SEXP distmatrix;
+        PROTECT(distmatrix = allocMatrix(REALSXP,len,len));
+        double *d = REAL(distmatrix);
+
+        unsigned int sz = trees.size();
+
+        // zero out diagonal
+        for(unsigned int i = 0; i < trees.size(); i++)
+            d[sz*i + i] = 0;
+
+        for(unsigned int i = 0; i < trees.size(); i++)
+        {
+            for(unsigned int j = i; j < trees.size(); j++)
+            {
+                int sim = edgeset_difference(trees[i], trees[j]);
+                d[sz*i + j] = sim;
+                d[sz*j + i] = sim;
+            }
+        }
+
+        UNPROTECT(1);
+
+        return distmatrix;
+    }
+
+    SEXP gromov_distmatrix(SEXP distmatrix, SEXP bDeltas, SEXP scale_method)
+    {
+        bool list_deltas = asLogical(bDeltas);
+        int scaleM = asInteger(scale_method);
+
+        unsigned len = length(distmatrix);
+        unsigned n = sqrt(static_cast<double>(len));
+        double *d = REAL(distmatrix);
+
+        SEXP g;
+
+        if(list_deltas)
+        {
+            PROTECT(g = allocVector(REALSXP, (n*(n-1)*(n-2)*(n-3))/(4*3*2)));
+            gromov_graycode(d, n, REAL(g), scaleM);
+            UNPROTECT(1);
+        }
+        else
+        {
+            PROTECT(g = allocVector(REALSXP, 1));
+            REAL(g)[0] = gromov_graycode(d, n, NULL, scaleM); 
+            UNPROTECT(1);
+        }
+
+        return g;
+    }
+}
+
+void build_tree_list(std::vector<string> &trees_in, std::vector<PhyEdgeSet> &trees, bool verbose)
+{
+    string t;
+
+    // build the leaf label LUT 
+    t = trees_in[0];
+    map<string,unsigned int> strtbl = AssignLeafLabels(t);
+
+    for(unsigned int treeno = 0; treeno < trees_in.size(); treeno++)
+    {
+        t = trees_in[treeno];
+
+        if(verbose)
+            Rprintf("Parsing tree %d\n", treeno);
+
+        PhyEdgeSet tr = NewickParse(t, strtbl);
+
+        ClampNegativeWeights(tr);
+        trees.push_back(tr);
+    }
+}
+
+int compute_phylo_distance_matrix(vector<string> trees_in, bool verbose, double *distance_matrix)
+{
+    std::vector<PhyEdgeSet> trees;
+    build_tree_list(trees_in, trees, verbose);
+
+    int ctr = 0;
+    int tot = 0.5 * (trees.size() * (trees.size() - 1));
+
+    // figure out how big the trees are
+    int k = -1;
+    while(trees[++k].size() == 0) ;
+
+    stl_bool *incompatibility_buffer = new stl_bool[trees[k].size()*trees[k].size()];
+
+    for(unsigned int j = 0; j < trees.size(); j++) // cols
+    {
+        for(unsigned int i = 0; i < j; i++) // rows
+        {
+            ctr++;
+            if(verbose)
+                Rprintf("%d/%d\t\t[%3.2f%%]\n", ctr,tot, (ctr/(double)tot)*100.0);
+
+            if(trees[i].size() == 0 || trees[j].size() == 0) // mark invalid trees with -1 so we can mark them as N/A later
+            {
+                distance_matrix[i*trees.size()+j] = -1;
+                distance_matrix[j*trees.size()+i] = -1;
+            }
+            else
+            {
+                double d = TreeDistance(trees[i],trees[j], incompatibility_buffer);
+                distance_matrix[i*trees.size()+j] = d;
+                distance_matrix[j*trees.size()+i] = d;
+            }
+
+        }
+    }
+
+    delete [] incompatibility_buffer;
+
+    // fill in the diagonal
+    for(unsigned int i = 0; i < trees.size(); i++)
+    {
+        distance_matrix[i*trees.size()+i] = 0;
+    }
+
+    return 0;
+}
+
+double gromov_graycode(double *m, size_t n, double *deltas, int scale) 
+{
+    /* implements Knuth 7.2.1.3 Alg R (Revolving Door) */
+
+    unsigned c[] = {(unsigned)-1,0,1,2,3,(unsigned)n}; 
+    unsigned t = 4; 
+    unsigned j; 
+
+    double s[3];
+    double max = 0;
+
+    double a,b;
+    double raw_delta;
+
+    unsigned i = 0;
+
+R2:
+    s[0] = m[c[1]*n + c[2]] + m[c[3]*n + c[4]];
+    s[1] = m[c[1]*n + c[3]] + m[c[2]*n + c[4]];
+    s[2] = m[c[1]*n + c[4]] + m[c[2]*n + c[3]];
+    
+    /* get the two largest */
+    if(s[0] >= s[1])
+    {
+        a = s[0];
+        if(s[1] >= s[2])
+            b = s[1];
+        else
+            b = s[1];
+    }
+    else
+    {
+        a = s[1];
+        if(s[2] >= s[0])
+            b = s[2];
+        else
+            b = s[1];
+    }
+
+    raw_delta = fabs(a-b);
+    switch(scale)
+    {
+        case 2:
+            raw_delta /= fmax(a,b);
+            break;
+        case 3:
+            double dd[4];
+            dd[0] = m[c[1]*n + c[4]] + m[c[1]*n + c[3]] + m[c[3]*n + c[4]];
+            dd[1] = m[c[1]*n + c[4]] + m[c[1]*n + c[2]] + m[c[2]*n + c[4]];
+            dd[2] = m[c[2]*n + c[3]] + m[c[3]*n + c[4]] + m[c[2]*n + c[4]];
+            dd[3] = m[c[1]*n + c[2]] + m[c[1]*n + c[3]] + m[c[2]*n + c[3]];
+
+            if(dd[0] >= dd[1] && dd[0] >= dd[2] && dd[0] >= dd[3])
+                raw_delta /= dd[0];
+            else if(dd[1] >= dd[0] && dd[1] >= dd[2] && dd[1] >= dd[3])
+                raw_delta /= dd[1];
+            else if(dd[2] >= dd[0] && dd[2] >= dd[1] && dd[2] >= dd[3])
+                raw_delta /= dd[2];
+            else
+                raw_delta /= dd[3];
+            break;
+    }
+
+    if(deltas)
+    {
+        deltas[i] = raw_delta;
+    }
+
+    if(raw_delta > max)
+    {
+        max = raw_delta;
+    }
+
+    i++;
+//R3: /* N.B. assumes t is even */
+    if(c[1] > 0) 
+    {
+        c[1]--;
+        goto R2;
+    }
+    else
+    {
+        j = 2;
+        goto R5;
+    }
+R4: 
+    if(c[j] >= j)
+    {
+        c[j] = c[j-1];
+        c[j-1] = j - 2;
+        goto R2;
+    }
+    else
+    {
+        j++;
+    }
+R5: 
+    if(c[j] + 1 < c[j+1])
+    {
+        c[j-1] = c[j];
+        c[j]++;
+        goto R2;
+    }
+    else
+    {
+        j++;
+        if(j <= t)
+        {
+            goto R4; 
+        }
+    }
+
+    return max/2.0;
+}
+
diff --git a/src/phylo.h b/src/phylo.h
new file mode 100644
index 0000000..4105f6d
--- /dev/null
+++ b/src/phylo.h
@@ -0,0 +1,100 @@
+#ifndef _PHYLO_H_
+#define _PHYLO_H_
+
+#include <string>
+#include <set>
+#include <map>
+#include <vector>
+#include <climits>
+
+using namespace std;
+
+typedef unsigned char stl_bool; // to prevent bit packing, which slows access time down considerably due to non-addressability of bits
+
+struct PhyEdge
+{
+    double weight;
+
+    unsigned int id; // used so we can identify edges without doing the set comparisons
+
+    vector<stl_bool> split; // downward == false, upward == true
+
+    PhyEdge(unsigned short leafcount) 
+    { 
+        /* this is a hack to add in the root node implicitly. it is added as an
+         * upward edge that is never assigned to. */
+        split.resize(leafcount+1, true);
+    }
+
+    bool operator==(const PhyEdge &rhs) const  // for inter-tree comparisons
+    {
+        return (split == rhs.split); /* two edges are the same if their splits of leaves are the same */
+    }
+
+    bool operator<(const PhyEdge &rhs) const // FOR ORDERING WITHIN A SINGLE TREE ONLY
+    {
+        return id < rhs.id;
+    }
+
+    inline unsigned int SubsetRemainder(const PhyEdge &rhs) const
+    {
+        bool subset = true;
+
+        // check left subset
+        for(unsigned int i = 0; i < rhs.split.size(); i++)
+        {
+            if(rhs.split[i] == false && split[i] != false)
+            {
+                subset = false;
+                break;
+            }
+        }
+
+        if(subset)
+        {
+            // count # of lefts in this 
+            int nl = 0;
+            for(unsigned int i = 0; i < split.size(); i++)
+                if(split[i] == false) nl++;
+            // count # of lefts in RHS
+            int nr = 0;
+            for(unsigned int i = 0; i < rhs.split.size(); i++)
+                if(rhs.split[i] == false) nr++;
+
+            return nl - nr;
+        }
+
+        // check right subset
+
+        subset = true; // reset flag
+        for(unsigned int i = 0; i < rhs.split.size(); i++)
+        {
+            if(rhs.split[i] == true && split[i] != false)
+            {
+                subset = false;
+                break;
+            }
+        }
+
+        if(subset)
+        {
+            // count # of lefts in this 
+            int nl = 0;
+            for(unsigned int i = 0; i < split.size(); i++)
+                if(split[i] == false) nl++;
+            // count # of lefts in RHS
+            int nr = 0;
+            for(unsigned int i = 0; i < rhs.split.size(); i++)
+                if(rhs.split[i] == true) nr++;
+
+            return nl - nr;
+        }
+
+        return UINT_MAX;
+    }
+};
+
+typedef vector<PhyEdge> PhyEdgeSet;
+
+#endif /* _PHYLO_H_ */
+
diff --git a/src/treedist-stripped.cpp b/src/treedist-stripped.cpp
new file mode 100644
index 0000000..74f5d28
--- /dev/null
+++ b/src/treedist-stripped.cpp
@@ -0,0 +1,553 @@
+/* 
+ * Polynomial Time Distance Algorithm for Binary Rooted Phylogenetic Trees
+ *
+ * Note: Algorithm based on presentation by Scott Provan / Megan Owen,
+ *       "Computing Geodesic Distances in Tree Space in Polynomial Time"
+ * 
+ * John Chakerian
+ * chakj at stanford.edu
+ *
+ * Last edited: Feb 28 2010
+ *
+ * Note that this is a stripped version because it includes only a built-in
+ * Edmonds-Karp network flow implementation; linear programming and Boost.Graph
+ * implementations are not provided, since they would introduce library 
+ * dependencies. 
+ *
+ */
+
+#include <iostream>
+#include <algorithm>
+#include <vector>
+#include <string>
+#include <stack>
+#include <queue>
+#include <iterator>
+#include <utility>
+
+#include <cmath>
+#include <cstdlib>
+#include <cassert>
+#include <cfloat>
+
+#include "treedist.h"
+
+#include "phylo.h"
+#include "newick.h"
+
+using namespace std;
+
+// Callback for clamping 0 or negative weights at a Very Small Value (that can
+// still be squared)
+void ClampNegativeWeights(PhyEdgeSet &a)
+{
+    for(unsigned int i = 0; i < a.size(); i++)
+        if(a[i].weight < sqrt(DBL_MIN))
+            a[i].weight = sqrt(DBL_MIN);
+}
+
+// Two edges are compatible if there is an intersection between the splits
+// that is empty.
+bool EdgesCompatible(PhyEdge &e1, PhyEdge &e2)
+{
+    // n.b. - left == false, right == true
+    bool n1 = true;
+    bool n2 = true;
+    bool n3 = true;
+    bool n4 = true;
+
+    for(unsigned int i = 0; i < e1.split.size(); i++)
+    {
+        // left/left
+        if(e1.split[i] == false && e2.split[i] == false)
+            n1 = false;
+        // right/right
+        if(e1.split[i] == true && e2.split[i] == true)
+            n2 = false;
+        // right/left
+        if(e1.split[i] == true && e2.split[i] == false)
+            n3 = false;
+        // left/right
+        if(e1.split[i] == false && e2.split[i] == true)
+            n4 = false;
+    }
+
+    return (n1 || n2 || n3 || n4);
+}
+
+double TreeDistance(PhyEdgeSet &a, PhyEdgeSet &b, stl_bool* incompatible)
+{
+    // 1. Separate out into shared edges & unique edges:
+    //     a) foreach edge e in a:
+    //          i) search b for any edge e' with a leaf set equivalent to e
+    //          ii) if found, 'remove' e and e' and store as a pair in
+    //              shared_edges
+    //     b) all remaining edges are unique to each
+    
+    assert(a.size() == b.size());
+ 
+    std::vector<pair<PhyEdge,PhyEdge> > shared_edges; 
+    shared_edges.reserve(a.size());
+
+    vector<int> a_shared_idxs;
+    a_shared_idxs.reserve(a.size());
+
+    vector<int> b_shared_idxs;
+    b_shared_idxs.reserve(b.size());
+
+    for(unsigned int i = 0; i < a.size(); i++)
+    {
+        for(unsigned int j = 0; j < b.size(); j++)
+        {
+            if(a[i] == b[j])
+            {
+                shared_edges.push_back( make_pair(a[i], b[j]) );
+                a_shared_idxs.push_back(i);
+                b_shared_idxs.push_back(j);
+                break; // stop checking for this i
+            }
+        }
+    }
+
+    // The not-shared edges
+    PhyEdgeSet p,q;
+
+    for(unsigned int i = 0; i < a.size(); i++)
+        if(find(a_shared_idxs.begin(), a_shared_idxs.end(), i) == a_shared_idxs.end())
+            p.push_back(a[i]);
+
+    for(unsigned int j = 0; j < b.size(); j++)
+        if(find(b_shared_idxs.begin(), b_shared_idxs.end(), j) == b_shared_idxs.end())
+            q.push_back(b[j]);
+
+    std::vector<pair<PhyEdgeSet,PhyEdgeSet> > bins(shared_edges.size()+1); // last one is generic bin
+
+    // 1.5. For every unique edge, classify under the tightest shared edge,
+
+    // Classify left-tree edges
+    for(unsigned int i = 0; i < p.size(); i++)
+    {
+        unsigned int smallest_subset = UINT_MAX;
+        unsigned int edge_id = UINT_MAX;
+
+        for(unsigned int j = 0; j < shared_edges.size(); j++)
+        {
+            unsigned int r = shared_edges[j].first.SubsetRemainder(p[i]);
+            if(r < smallest_subset)
+            {
+                smallest_subset = r;
+                edge_id = j;
+                if(smallest_subset == 1) break;
+            }
+        }
+
+        if(edge_id == UINT_MAX) // dump it in the generic bin
+            bins[shared_edges.size()].first.push_back(p[i]);
+        else
+            bins[edge_id].first.push_back(p[i]);
+    }
+
+    // Classify right-tree edges
+    for(unsigned int i = 0; i < q.size(); i++)
+    {
+        unsigned int smallest_subset = UINT_MAX;
+        unsigned int edge_id = UINT_MAX;
+
+        for(unsigned int j = 0; j < shared_edges.size(); j++)
+        {
+            unsigned int r = shared_edges[j].second.SubsetRemainder(q[i]);
+            if(r < smallest_subset)
+            {
+                smallest_subset = r;
+                edge_id = j;
+                if(smallest_subset == 1) break;
+            }
+        }
+
+        if(edge_id == UINT_MAX) // dump it in the generic bin
+            bins[shared_edges.size()].second.push_back(q[i]);
+        else
+            bins[edge_id].second.push_back(q[i]);
+    }
+
+    // Precache all incompatibilities in a 'huge' logical array
+    // (this could be made into a bitmatrix later, if space is in demand)
+    // (this could also be upper-triangular, but again, space isn't a big deal
+    // at the moment)
+   
+    // Keep in mind here that the IDs are _only_ valid within a certain tree.
+    // Thus the ordering always must be (left-hand tree edge , right-hand tree
+    // edge), and the matrix is NOT symmetric
+
+    unsigned int max_id = a.size(); 
+
+    for(unsigned int binid = 0; binid < bins.size(); binid++)
+        for(unsigned int i = 0; i < bins[binid].first.size(); i++)
+            for(unsigned int j = 0; j < bins[binid].second.size(); j++)
+                incompatible[bins[binid].first[i].id*max_id + bins[binid].second[j].id] = EdgesCompatible(bins[binid].first[i], bins[binid].second[j]);
+
+    // 2. For every shared edge bin, compute distance for all unique edges under
+    // it.
+    
+    double unique_edge_factor = 0;
+    for(unsigned int i = 0; i < bins.size(); i++)
+        if(bins[i].first.size() > 0)
+            unique_edge_factor += DisjointTreeDistance(bins[i].first, bins[i].second, incompatible, max_id);
+
+    // 3. Calculate the final distance from both shared & unique edges
+    // a) the shared edges have Euclidean distance:
+
+    double shared_edge_factor = 0;
+    for(unsigned int i = 0; i < shared_edges.size(); i++)
+    {
+        double t = shared_edges[i].first.weight - shared_edges[i].second.weight;
+        shared_edge_factor += t*t;
+    }
+
+    // c) combine them together and return
+    return sqrt(unique_edge_factor + shared_edge_factor);
+}
+
+struct NetworkFlowResult
+{
+    double flow;
+    PhyEdgeSet A_i, B_i, A_i_prime, B_i_prime;
+};
+
+NetworkFlowResult EKNetworkFlow(PhyEdgeSet &A_i, PhyEdgeSet& B_i, stl_bool* incompatible, unsigned int max_id) 
+{
+    // make these convenient, since they won't be changing
+    unsigned int ac = A_i.size(); 
+    unsigned int bc = B_i.size(); 
+
+    // compute square sums of A_i and B_i
+    double sumsq_A_i = 0;
+    for(unsigned int i = 0; i < ac; i++)
+        sumsq_A_i += A_i[i].weight*A_i[i].weight;
+
+    double sumsq_B_i = 0;
+    for(unsigned int j = 0; j < bc; j++)
+        sumsq_B_i += B_i[j].weight*B_i[j].weight;
+
+    // A_i first, then B_i, then source, then sink
+    unsigned int N = ac + bc + 2; // for offsets
+    unsigned int s = ac+bc;     // source index
+    unsigned int t = ac+bc+1;   // sink index
+
+    // set up the variables for Edmonds-Karp
+
+    // total flow
+    double f = 0;
+
+    double *F = new double[N*N]; // flow
+    double *C = new double[N*N]; // capacity
+
+    // set all the flow to zero
+    fill(F,F + N*N,0);
+    fill(C,C + N*N,0); // necessary?
+
+    // add capacities from source to A_i
+    for(unsigned int i = 0; i < A_i.size(); i++) // for (s,i)
+    {
+        C[N*s + i] = (A_i[i].weight*A_i[i].weight)/sumsq_A_i;
+        C[N*i + s] = 0;
+    }
+
+    // add capacities from B_i to sink
+    for(unsigned int j = 0; j < B_i.size(); j++) // for (j, t)
+    {
+        C[N*(ac+j) + t] = (B_i[j].weight*B_i[j].weight)/sumsq_B_i;
+        C[N*t + (ac+j)] = 0;
+    }
+
+    // add in incompatible edges
+    vector<pair<unsigned int,unsigned int> > incompatible_edge_pairs;
+    for(unsigned int i = 0; i < A_i.size(); i++)
+    {
+        for(unsigned int j = 0; j < B_i.size(); j++)
+        {
+            if(incompatible[A_i[i].id*max_id + B_i[j].id ]) // add (i,j) and (j,i)
+            {
+                C[N*i+(ac+j)] = DBL_MAX;
+                C[N*(ac+j)+i] = 0;
+            }
+        }
+    }
+
+    int *P = new int[N]; // path (reused)
+    double *M = new double[N]; // capacity to node
+
+    while(true)
+    {
+        // clean up old values, init the path
+        fill(P, P+N, -1);
+        fill(M, M+N, DBL_MAX);
+        P[s] = -2;
+
+        // do BFS to get a new path
+
+        std::queue<unsigned int> tc; // to-check
+        tc.push(s);
+
+        bool res = false; // make sure we get a result
+        while(!tc.empty())
+        {
+            unsigned int u = tc.front();
+            tc.pop();
+
+            // we need to get OUT-edges
+            // decide if we're on a A_i, B_i, s, or t
+            vector<unsigned int> E;
+            if(u == s)
+            {
+                // add the nodes in A_i
+                E.resize(ac,0);
+                for(unsigned int i = 0; i < ac; i++)
+                    E[i] = i;
+            }
+            else if(u == t)
+            {
+                // add the nodes in B_i
+                E.resize(bc,0);
+                for(unsigned int i = 0; i < bc; i++)
+                    E[i] = ac + i;
+            }
+            else if(u < ac) // then it is in A_i
+            {
+                // add the nodes in A_i
+                for(unsigned int j = 0; j < bc; j++)
+                    if(incompatible[A_i[u].id*max_id + B_i[j].id ])
+                        E.push_back(ac + j);
+                // add the source - don't think we actually need to do this
+                E.push_back(s);
+            }
+            else // then it is in B_i
+            {
+                // add the nodes in A_i
+                for(unsigned int j = 0; j < ac; j++)
+                    if(incompatible[A_i[j].id*max_id + B_i[u-ac].id ])
+                        E.push_back(j);
+                // add the sink
+                E.push_back(t);
+            }
+
+            // now iterate through E 
+            for(unsigned int i = 0; i < E.size(); i++)
+            {
+                if(C[ N*u+E[i] ] - F[ N*u+E[i] ] > 0 && P[E[i]] == -1)
+                {
+                    P[E[i]] = u;
+                    M[E[i]] = min(M[u],C[ N*u+E[i] ] - F[ N*u+E[i] ]);
+                    if(E[i] != t)
+                        tc.push(E[i]);           
+                    else
+                    {
+                        // we're done with the BFS - empty the queue out
+                        while(!tc.empty())
+                            tc.pop();
+                        res = true;
+                    }
+                }
+            }
+        }
+        
+        double m = 0;
+        if(res) m = M[t];
+
+        // check capacity of the path, break if 0 (no new path found)
+        if(m == 0)
+            break; 
+
+        // add capacity of the path to flow
+        f += m;
+
+        // travel from sink to source along the path, updating edges with +/- cap
+        unsigned int v = t;
+        while(v != s)
+        {
+            unsigned int u = P[v];
+            F[N*u + v] += m;
+            F[N*v + u] -= m;
+            v = u;
+        }
+    }
+
+    NetworkFlowResult r;
+    r.flow = f;
+
+    if(f < 1 - DBL_EPSILON*100)
+    {
+        // do a BFS to get splits
+
+        // reset P
+        fill(P, P+N, -1);
+
+        std::queue<unsigned int> Q;
+        Q.push(s);
+
+        while(!Q.empty())
+        {
+            unsigned int u = Q.front();
+            Q.pop();
+
+            // we need to get OUT-edges
+            // decide if we're on a A_i, B_i, s, or t
+            vector<unsigned int> E;
+            if(u == s)
+            {
+                // add the nodes in A_i
+                E.resize(ac,0);
+                for(unsigned int i = 0; i < ac; i++)
+                    E[i] = i;
+            }
+            else if(u == t)
+            {
+                // add the nodes in B_i
+                E.resize(bc,0);
+                for(unsigned int i = 0; i < bc; i++)
+                    E[i] = ac + i;
+            }
+            else if(u < ac) // then it is in A_i
+            {
+                // add the nodes in A_i
+                for(unsigned int j = 0; j < bc; j++)
+                    if(incompatible[ A_i[u].id*max_id + B_i[j].id ])
+                        E.push_back(ac + j);
+                // add the source - don't think we actually need to do this
+                E.push_back(s);
+            }
+            else // then it is in B_i
+            {
+                // add the nodes in A_i
+                for(unsigned int j = 0; j < ac; j++)
+                    if(incompatible[A_i[j].id*max_id + B_i[u-ac].id ])
+                        E.push_back(j);
+                // add the sink
+                E.push_back(t);
+            }
+
+            // now iterate through E 
+            for(unsigned int i = 0; i < E.size(); i++)
+            {
+                if(C[ N*u+E[i] ] - F[ N*u+E[i] ] > 0 && P[E[i]] == -1)
+                {
+                    P[E[i]] = 1;
+                    Q.push(E[i]); 
+                }
+            }
+        }
+
+        // form our split vectors
+        for(unsigned int i = 0; i < N-2; i++) // ignore the last 2, since source/sink
+        {
+            if(i < ac) // in A_i
+            {
+                if(P[i] == 1)
+                    r.A_i_prime.push_back(A_i[i]);
+                else
+                    r.A_i.push_back(A_i[i]);
+            }
+            else // in B_i
+            {
+                if(P[i] == 1)
+                    r.B_i_prime.push_back(B_i[i-ac]);
+                else
+                    r.B_i.push_back(B_i[i-ac]);
+            }
+        }
+    }
+
+    delete [] C;
+    delete [] F;
+    delete [] P;
+    delete [] M;
+
+    return r;
+}
+
+double DisjointTreeDistance(PhyEdgeSet &a, PhyEdgeSet &b, stl_bool *incompatible, unsigned int max_id)
+{
+    // 2. For unique edges:
+    //      a) initialize A_1 to all unique edges in a, B_1 to all unique edges in b
+    //      b) for each edge e in A_i find all incompatible edges in b
+    //      c) construct a bipartite graph based on these incompatible edges
+    //      d) run max flow
+    //      e) do BFS to find the min-weight vertex cover
+    //      f) if min-weight vertex cover has weight < 1, split based on
+    //          min-weight vertex cover and push blocks into the right places, else move to next A_i
+
+    stack<pair<PhyEdgeSet, PhyEdgeSet> > unchecked_blocks;
+    std::vector<pair<PhyEdgeSet, PhyEdgeSet> > finished_blocks; // NB blocks in reverse order
+
+    pair<PhyEdgeSet, PhyEdgeSet> p;
+    for(unsigned int i = 0; i < a.size(); i++)
+        p.first.push_back(a[i]);
+
+    for(unsigned int j = 0; j < b.size(); j++)
+        p.second.push_back(b[j]);
+
+    unchecked_blocks.push(p);
+
+    while(!unchecked_blocks.empty())
+    {
+        PhyEdgeSet A_i = unchecked_blocks.top().first;
+        PhyEdgeSet B_i = unchecked_blocks.top().second;
+        unchecked_blocks.pop();
+
+        // Some sanity checks
+        if(A_i.size() == 0 || B_i.size() == 0)
+        {
+            finished_blocks.push_back( make_pair(A_i, B_i) ); 
+            continue;
+        }
+
+        if(A_i.size() == 1 && B_i.size() == 1)
+        {
+            finished_blocks.push_back( make_pair(A_i, B_i) ); 
+            continue;
+        }
+
+        NetworkFlowResult r = EKNetworkFlow(A_i, B_i, incompatible, max_id);
+
+        if(r.flow < 1 - DBL_EPSILON*100) 
+        {
+            unchecked_blocks.push( make_pair(r.A_i_prime, r.B_i_prime) );
+            unchecked_blocks.push( make_pair(r.A_i, r.B_i) );
+        }
+        else
+        {
+            finished_blocks.push_back( make_pair(A_i, B_i) );
+        }
+    }
+
+    // the unique edges have distance sqrt( sum_i (||A_i|| + ||B_i||)^2 )
+    double unique_edge_factor = 0;
+    for(unsigned int i = 0; i < finished_blocks.size(); i++)
+    {
+        double accum = 0;
+
+        PhyEdgeSet a = finished_blocks[i].first;
+        PhyEdgeSet b = finished_blocks[i].second;
+
+        // do the A_i block
+        double t = 0;
+        for(unsigned j = 0; j < a.size(); j++)
+            t += a[j].weight*a[j].weight;
+        t = sqrt(t);
+
+        accum += t;
+
+        // do the B_i block
+        t = 0;
+        for(unsigned j = 0; j < b.size(); j++)
+            t += b[j].weight*b[j].weight;
+        t = sqrt(t);
+
+        accum += t;
+
+        unique_edge_factor += accum*accum;
+    }
+
+    return unique_edge_factor; // n.b., not square rooted
+}
+
diff --git a/src/treedist.h b/src/treedist.h
new file mode 100644
index 0000000..d33c745
--- /dev/null
+++ b/src/treedist.h
@@ -0,0 +1,32 @@
+/* 
+ * Polynomial Time Distance Algorithm for Binary Rooted Phylogenetic Trees
+ *
+ * Note: Algorithm based on presentation by Scott Provan / Megan Owen,
+ *       "Computing Geodesic Distances in Tree Space in Polynomial Time"
+ * 
+ * John Chakerian
+ * chakj at stanford.edu
+ *
+ */
+
+#ifndef __TREEDIST_H_
+#define __TREEDIST_H_
+
+#include <string>
+
+#include "phylo.h"
+#include "newick.h"
+
+double ConeDistance(PhyEdgeSet &a, PhyEdgeSet &b);
+double TreeDistance(PhyEdgeSet &a, PhyEdgeSet &b, stl_bool* incompatible);
+double DisjointTreeDistance(PhyEdgeSet &a, PhyEdgeSet &b, stl_bool* incompatible, unsigned int max_id);
+void ClampNegativeWeights(PhyEdgeSet &a);
+
+enum maxflow_enum { LP, EK, EKF, KOL, PR };
+extern maxflow_enum mf_method;
+enum metric_enum { GEODESIC, CONEPATH };
+extern metric_enum metric_method;
+
+string tolower_str(string in);
+
+#endif

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/r-cran-distory.git



More information about the debian-med-commit mailing list