[med-svn] [pvclust] 01/04: Imported Upstream version 1.3-0
Charles Plessy
plessy at moszumanska.debian.org
Thu Oct 9 01:49:24 UTC 2014
This is an automated email from the git hooks/post-receive script.
plessy pushed a commit to branch master
in repository pvclust.
commit b95003778ac0ce489d99ace6c95ce0458d552a56
Author: Charles Plessy <plessy at riken.jp>
Date: Thu Oct 9 09:59:04 2014 +0900
Imported Upstream version 1.3-0
---
DESCRIPTION | 15 +-
MD5 | 13 +
R/pvclust.R | 892 ++++++++++++++++++++++++++-------------------------
man/lung.Rd | 2 +
man/msfit.Rd | 2 +-
man/msplot.Rd | 2 +-
man/plot.pvclust.Rd | 2 +-
man/print.pvclust.Rd | 2 +-
man/pvclust.Rd | 411 ++++++++++++------------
man/pvpick.Rd | 2 +-
man/seplot.Rd | 2 +-
11 files changed, 689 insertions(+), 656 deletions(-)
diff --git a/DESCRIPTION b/DESCRIPTION
index 49fba41..77f54dd 100755
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,19 +1,20 @@
Package: pvclust
-Version: 1.2-2
-Date: 2011-04-13
+Version: 1.3-0
+Date: 2014-10-08
Title: Hierarchical Clustering with P-Values via Multiscale Bootstrap
Resampling
Author: Ryota Suzuki <suzuki at ef-prime.com>, Hidetoshi Shimodaira
- <shimo at is.titech.ac.jp>
+ <shimo at sigmath.es.osaka-u.ac.jp>
Maintainer: Ryota Suzuki <suzuki at ef-prime.com>
Depends: R (>= 2.10.0)
-Suggests: MASS, snow, Rmpi
+Suggests: MASS, parallel
Description: pvclust is a package for assessing the uncertainty in
hierarchical cluster analysis. It provides AU (approximately
unbiased) p-values as well as BP (boostrap probability) values
computed via multiscale bootstrap resampling.
License: GPL (>= 2)
-URL: http://www.is.titech.ac.jp/~shimo/prog/pvclust/
-Packaged: 2011-04-13 17:44:46 UTC; suzuki
+URL: http://www.sigmath.es.osaka-u.ac.jp/shimo-lab/prog/pvclust/
+Packaged: 2014-10-08 11:57:10 UTC; suzuki
+NeedsCompilation: no
Repository: CRAN
-Date/Publication: 2011-04-13 20:44:41
+Date/Publication: 2014-10-08 14:02:05
diff --git a/MD5 b/MD5
new file mode 100644
index 0000000..a1bdb57
--- /dev/null
+++ b/MD5
@@ -0,0 +1,13 @@
+751a6b3f35581b472f4e12f382c79822 *DESCRIPTION
+37c39c6efcd73c5db52c9f7d2d26301b *NAMESPACE
+b0aabf99eb6db36b0452ee07a612dbff *R/pvclust-internal.R
+73faece83dca117e5f5e7f58418d5c34 *R/pvclust.R
+4ce445fdf8be068ed0f770d3c7bafd17 *data/lung.RData
+d4137b4bb2fc7b26f40719391597a20a *man/lung.Rd
+305eedf26b855648f7456e9da01e5094 *man/msfit.Rd
+7fb48846f78962a79e748621b99ba738 *man/msplot.Rd
+9dfaa13e0b453115bd6191036f6a5473 *man/plot.pvclust.Rd
+3ccb94ceeb320278445c2be3fd41bcbb *man/print.pvclust.Rd
+10d182003a1d3ab8ab43b6a0f3759974 *man/pvclust.Rd
+cc1b3150c5eaedb0877afdfd2f385bed *man/pvpick.Rd
+c5bd20b8d802595502f2f6dc4ce989df *man/seplot.Rd
diff --git a/R/pvclust.R b/R/pvclust.R
index 5678e7c..a3f590a 100755
--- a/R/pvclust.R
+++ b/R/pvclust.R
@@ -1,440 +1,452 @@
-pvclust <- function(data, method.hclust="average",
- method.dist="correlation", use.cor="pairwise.complete.obs",
- nboot=1000, r=seq(.5,1.4,by=.1), store=FALSE, weight=FALSE)
- {
- # data: (n,p) matrix, n-samples, p-variables
- n <- nrow(data); p <- ncol(data)
-
- # hclust for original data
- METHODS <- c("ward", "single", "complete", "average", "mcquitty",
- "median", "centroid")
- method.hclust <- METHODS[pmatch(method.hclust, METHODS)]
- distance <- dist.pvclust(data, method=method.dist, use.cor=use.cor)
- data.hclust <- hclust(distance, method=method.hclust)
-
- # multiscale bootstrap
- size <- floor(n*r)
- rl <- length(size)
-
- if(rl == 1) {
- if(r != 1.0)
- warning("Relative sample size r is set to 1.0. AU p-values are not calculated\n")
-
- r <- list(1.0)
- }
- else
- r <- as.list(size/n)
-
- mboot <- lapply(r, boot.hclust, data=data, object.hclust=data.hclust, nboot=nboot,
- method.dist=method.dist, use.cor=use.cor,
- method.hclust=method.hclust, store=store, weight=weight)
-
- result <- pvclust.merge(data=data, object.hclust=data.hclust, mboot=mboot)
-
- return(result)
- }
-
-plot.pvclust <- function(x, print.pv=TRUE, print.num=TRUE, float=0.01,
- col.pv=c(2,3,8), cex.pv=0.8, font.pv=NULL,
- col=NULL, cex=NULL, font=NULL, lty=NULL, lwd=NULL,
- main=NULL, sub=NULL, xlab=NULL, ...)
-{
- if(is.null(main))
- main="Cluster dendrogram with AU/BP values (%)"
-
- if(is.null(sub))
- sub=paste("Cluster method: ", x$hclust$method, sep="")
-
- if(is.null(xlab))
- xlab=paste("Distance: ", x$hclust$dist.method)
-
- plot(x$hclust, main=main, sub=sub, xlab=xlab, col=col, cex=cex,
- font=font, lty=lty, lwd=lwd, ...)
-
- if(print.pv)
- text(x, col=col.pv, cex=cex.pv, font=font.pv, float=float, print.num=print.num)
-}
-
-text.pvclust <- function(x, col=c(2,3,8), print.num=TRUE, float=0.01, cex=NULL, font=NULL, ...)
-{
- axes <- hc2axes(x$hclust)
- usr <- par()$usr; wid <- usr[4] - usr[3]
- au <- as.character(round(x$edges[,"au"]*100))
- bp <- as.character(round(x$edges[,"bp"]*100))
- rn <- as.character(row.names(x$edges))
- au[length(au)] <- "au"
- bp[length(bp)] <- "bp"
- rn[length(rn)] <- "edge #"
- a <- text(x=axes[,1], y=axes[,2] + float * wid, au,
- col=col[1], pos=2, offset=.3, cex=cex, font=font)
- a <- text(x=axes[,1], y=axes[,2] + float * wid, bp,
- col=col[2], pos=4, offset=.3, cex=cex, font=font)
- if(print.num)
- a <- text(x=axes[,1], y=axes[,2], rn,
- col=col[3], pos=1, offset=.3, cex=cex, font=font)
-}
-
-print.pvclust <- function(x, which=NULL, digits=3, ...)
-{
- if(is.null(which)) which <- 1:nrow(x$edges)
- cat("\n")
- cat(paste("Cluster method: ", x$hclust$method, "\n", sep=""))
- cat(paste("Distance : ", x$hclust$dist.method, "\n\n", sep=""))
- cat("Estimates on edges:\n\n")
- print(round(x$edges[which,], digits=digits))
- cat("\n")
-}
-
-summary.pvclust <- function(object, ...){
- class(object) <- "list"
- summary(object, ...)
-}
-
-pvrect <- function(x, alpha=0.95, pv="au", type="geq", max.only=TRUE, border=2, ...)
- {
- len <- nrow(x$edges)
- member <- hc2split(x$hclust)$member
- order <- x$hclust$order
- usr <- par("usr")
- xwd <- usr[2] - usr[1]
- ywd <- usr[4] - usr[3]
- cin <- par()$cin
-
- ht <- c()
- j <- 1
-
- if(is.na(pm <- pmatch(type, c("geq", "leq", "gt", "lt"))))
- stop("Invalid type argument: see help(pvrect)")
-
- for(i in (len - 1):1)
- {
- if (pm==1) wh <- (x$edges[i,pv] >= alpha) # Greater than or EQuals
- else if(pm==2) wh <- (x$edges[i,pv] <= alpha) # Lower than or EQuals
- else if(pm==3) wh <- (x$edges[i,pv] > alpha) # Greater Than
- else if(pm==4) wh <- (x$edges[i,pv] > alpha) # Lower Than
-
- if(wh)
- {
- mi <- member[[i]]
- ma <- match(mi, order)
-
- if(max.only == FALSE || (max.only && sum(match(ma, ht, nomatch=0)) == 0))
- {
- xl <- min(ma)
- xr <- max(ma)
- yt <- x$hclust$height[i]
- yb <- usr[3]
-
- mx <- xwd / length(member) / 3
- my <- ywd / 200
-
- rect(xl - mx, yb + my, xr + mx, yt + my, border=border, shade=NULL, ...)
-
- j <- j + 1
- }
- ht <- c(ht, ma)
- }
- }
- }
-
-msplot <- function(x, edges=NULL, ...)
- {
- if(is.null(edges)) edges <- 1:length(x$msfit)
- d <- length(edges)
-
- mfrow.bak <- par()$mfrow
- on.exit(par(mfrow=mfrow.bak))
-
- par(mfrow=n2mfrow(d))
-
- for(i in edges) {
- if(i == 1 || (i %% 10 == 1 && i > 20))
- main <- paste(i, "st edge", sep="")
- else if(i == 2 || (i %% 10 == 2 && i > 20))
- main <- paste(i, "nd edge", sep="")
- else if(i == 3 || (i %% 10 == 3 && i > 20))
- main <- paste(i, "rd edge", sep="")
- else
- main <- paste(i, "th edge", sep="")
-
- plot(x$msfit[[i]], main=main, ...)
- }
- }
-
-lines.pvclust <- function(x, alpha=0.95, pv="au", type="geq", col=2, lwd=2, ...)
- {
- len <- nrow(x$edges)
- member <- hc2split(x$hclust)$member
- order <- x$hclust$order
- usr <- par("usr")
- xwd <- usr[2] - usr[1]
- ywd <- usr[4] - usr[3]
- cin <- par()$cin
-
- ht <- c()
- j <- 1
-
- if(is.na(pm <- pmatch(type, c("geq", "leq", "gt", "lt"))))
- stop("Invalid type argument: see help(lines.pvclust)")
-
- for(i in (len - 1):1)
- {
- if (pm==1) wh <- (x$edges[i,pv] >= alpha) # Greater than or EQuals
- else if(pm==2) wh <- (x$edges[i,pv] <= alpha) # Lower than or EQuals
- else if(pm==3) wh <- (x$edges[i,pv] > alpha) # Greater Than
- else if(pm==4) wh <- (x$edges[i,pv] > alpha) # Lower Than
-
- if(wh)
- {
- mi <- member[[i]]
- ma <- match(mi, order)
-
- if(sum(match(ma, ht, nomatch=0)) == 0)
- {
- xl <- min(ma)
- xr <- max(ma)
- yt <- x$hclust$height[i]
- yb <- usr[3]
-
- mx <- xwd/length(member)/10
-
- segments(xl-mx, yb, xr+mx, yb, xpd=TRUE, col=col, lwd=lwd, ...)
-
- j <- j + 1
- }
- ht <- c(ht, ma)
- }
- }
- }
-
-pvpick <- function(x, alpha=0.95, pv="au", type="geq", max.only=TRUE)
- {
- len <- nrow(x$edges)
- member <- hc2split(x$hclust)$member
- order <- x$hclust$order
-
- ht <- c()
- a <- list(clusters=list(), edges=c()); j <- 1
-
- if(is.na(pm <- pmatch(type, c("geq", "leq", "gt", "lt"))))
- stop("Invalid type argument: see help(pickup)")
-
- for(i in (len - 1):1)
- {
- if (pm==1) wh <- (x$edges[i,pv] >= alpha) # Greater than or Equals
- else if(pm==2) wh <- (x$edges[i,pv] <= alpha) # Lower than or Equals
- else if(pm==3) wh <- (x$edges[i,pv] > alpha) # Greater Than
- else if(pm==4) wh <- (x$edges[i,pv] > alpha) # Lower Than
-
- if(wh)
- {
- mi <- member[[i]]
- ma <- match(mi, order)
-
- if(max.only == FALSE || (max.only && sum(match(ma, ht, nomatch=0)) == 0))
- {
- a$clusters[[j]] <- x$hclust$labels[mi]
- a$edges <- c(a$edges,i)
-
- j <- j + 1
- }
- ht <- c(ht, ma)
- }
- }
-
- a$edges <- a$edges[length(a$edges):1]
- a$clusters <- a$clusters[length(a$edges):1]
-
- return(a)
- }
-
-parPvclust <- function(cl, data, method.hclust="average",
- method.dist="correlation", use.cor="pairwise.complete.obs",
- nboot=1000, r=seq(.5,1.4,by=.1), store=FALSE,
- weight=FALSE,
- init.rand=TRUE, seed=NULL)
- {
- if(!(require(snow))) stop("Package snow is required for parPvclust.")
-
- if((ncl <- length(cl)) < 2 || ncl > nboot) {
- warning("Too small value for nboot: non-parallel version is executed.")
- return(pvclust(data,method.hclust,method.dist,use.cor,nboot,r,store))
- }
-
- if(init.rand) {
- if(is.null(seed))
- seed <- 1:length(cl)
- else if(length(seed) != length(cl))
- stop("seed and cl should have the same length.")
-
- # setting random seeds
- parLapply(cl, as.list(seed), set.seed)
- }
-
- # data: (n,p) matrix, n-samples, p-variables
- n <- nrow(data); p <- ncol(data)
-
- # hclust for original data
- METHODS <- c("ward", "single", "complete", "average", "mcquitty",
- "median", "centroid")
- method.hclust <- METHODS[pmatch(method.hclust, METHODS)]
- distance <- dist.pvclust(data, method=method.dist, use.cor=use.cor)
- data.hclust <- hclust(distance, method=method.hclust)
-
- # multiscale bootstrap
- size <- floor(n*r)
- rl <- length(size)
-
- if(rl == 1) {
- if(r != 1.0)
- warning("Relative sample size r is set to 1.0. AU p-values are not calculated\n")
-
- r <- list(1.0)
- }
- else
- r <- as.list(size/n)
-
- nbl <- as.list(rep(nboot %/% ncl,times=ncl))
-
- if((rem <- nboot %% ncl) > 0)
- nbl[1:rem] <- lapply(nbl[1:rem], "+", 1)
-
- cat("Multiscale bootstrap... ")
-
- mlist <- parLapply(cl, nbl, pvclust.node,
- r=r, data=data, object.hclust=data.hclust, method.dist=method.dist,
- use.cor=use.cor, method.hclust=method.hclust,
- store=store, weight=weight)
- cat("Done.\n")
-
- mboot <- mlist[[1]]
-
- for(i in 2:ncl) {
- for(j in 1:rl) {
- mboot[[j]]$edges.cnt <- mboot[[j]]$edges.cnt + mlist[[i]][[j]]$edges.cnt
- mboot[[j]]$nboot <- mboot[[j]]$nboot + mlist[[i]][[j]]$nboot
- mboot[[j]]$store <- c(mboot[[j]]$store, mlist[[i]][[j]]$store)
- }
- }
-
- result <- pvclust.merge( data=data, object.hclust=data.hclust, mboot=mboot)
-
- return(result)
- }
-
-msfit <- function(bp, r, nboot) {
-
- if(length(bp) != length(r))
- stop("bp and r should have the same length")
-
- nboot <- rep(nboot, length=length(bp))
-
- use <- bp > 0 & bp < 1
-
- p <- se <- c(0,0); names(p) <- names(se) <- c("au", "bp")
- coef <- c(0,0); names(coef) <- c("v", "c")
-
- a <- list(p=p, se=se, coef=coef, df=0, rss=0, pchi=0); class(a) <- "msfit"
-
- if(sum(use) < 2) {
- # if(mean(bp) < .5) a$p[] <- c(0, 0) else a$p[] <- c(1, 1)
- if(mean(bp) < .5) a$p[] <- c(0, bp[r==1.0]) else a$p[] <- c(1, bp[r==1.0])
- return(a)
- }
-
- bp <- bp[use]; r <- r[use]; nboot <- nboot[use]
- zz <- -qnorm(bp)
- vv <- ((1 - bp) * bp) / (dnorm(zz)^2 * nboot)
- a$use <- use; a$r <- r; a$zz <- zz
-
- X <- cbind(sqrt(r), 1/sqrt(r)); dimnames(X) <- list(NULL, c("v","c"))
- fit <- lsfit(X, zz, 1/vv, intercept=FALSE)
- a$coef <- coef <- fit$coef
-
- h.au <- c(1, -1); h.bp <- c(1, 1)
-
- z.au <- drop(h.au %*% coef); z.bp <- drop(h.bp %*% coef)
- a$p["au"] <- pnorm(-z.au); a$p["bp"] <- pnorm(-z.bp)
- V <- solve(crossprod(X, X/vv))
- vz.au <- drop(h.au %*% V %*% h.au); vz.bp <- drop(h.bp %*% V %*% h.bp)
- a$se["au"] <- dnorm(z.au) * sqrt(vz.au); a$se["bp"] <- dnorm(z.bp) * sqrt(vz.bp)
- a$rss <- sum(fit$residual^2/vv)
-
- if((a$df <- sum(use) - 2) > 0) {
- a$pchi <- pchisq(a$rss, lower.tail=FALSE, df=a$df)
- }
- else a$pchi <- 1.0
-
- return(a)
-}
-
-plot.msfit <- function(x, curve=TRUE, main=NULL, sub=NULL, xlab=NULL, ylab=NULL, ...)
-{
- if(is.null(main)) main="Curve fitting for multiscale bootstrap resampling"
- if(is.null(sub))
- {
- sub <- paste("AU = ", round(x$p["au"], digits=2),
- ", BP = ", round(x$p["bp"], digits=2),
- ", v = ", round(x$coef["v"], digits=2),
- ", c = ", round(x$coef["c"], digits=2),
- ", pchi = ", round(x$pchi, digits=2))
- }
- if(is.null(xlab)) xlab=expression(sqrt(r))
- if(is.null(ylab)) ylab=expression(z-value)
-
- a <- sqrt(x$r); b <- x$zz
-
- if(!is.null(a) && !is.null(b)) {
- plot(a, b, main=main, sub=sub, xlab=xlab, ylab=ylab, ...)
- if(curve) lines(x, ...)
- }
- else if (!is.null(a)){
- plot(0, 0, main=main, sub=sub, xlab=xlab, ylab=ylab,
- type="n", xaxt="n", yaxt="n", ...)
- a <- text(mean(a), 0, "No fitting")
- }
-}
-
-lines.msfit <- function(x, col=2, lty=1, ...) {
- v <- x$coef["v"]; c <- x$coef["c"]
- curve(v * x + c / x, add=TRUE, col=col, lty=lty)
-}
-
-summary.msfit <- function(object, digits=3, ...) {
- cat("\nResult of curve fitting for multiscale bootstrap resampling:\n\n")
-
- cat("Estimated p-values:\n")
- pv <- data.frame(object$p, object$se)
- names(pv) <- c("Estimate", "Std. Error"); row.names(pv) <- c("au", "bp")
- print(pv, digits=digits); cat("\n")
-
- cat("Estimated coefficients:\n")
- coef <- object$coef
- print(coef, digits=digits); cat("\n")
-
- cat(paste("Residual sum of squares: ", round(object$rss,digits=digits)),
- ", p-value: ", round(object$pchi, digits=digits),
- " on ", object$df, " DF\n\n", sep="")
-}
-
-seplot <- function(object, type=c("au", "bp"), identify=FALSE,
- main=NULL, xlab=NULL, ylab=NULL, ...)
- {
- if(!is.na(pm <- pmatch(type[1], c("au", "bp")))) {
- wh <- c("au", "bp")[pm]
-
- if(is.null(main))
- main <- "p-value vs standard error plot"
- if(is.null(xlab))
- xlab <- c("AU p-value", "BP value")[pm]
- if(is.null(ylab))
- ylab <- "Standard Error"
-
- plot(object$edges[,wh], object$edges[,paste("se", wh, sep=".")],
- main=main, xlab=xlab, ylab=ylab, ...)
- if(identify)
- identify(x=object$edges[,wh], y=object$edges[,paste("se", wh, sep=".")],
- labels=row.names(object$edges))
- }
- else stop("'type' should be \"au\" or \"bp\".")
- }
+pvclust <- function(data, method.hclust="average",
+ method.dist="correlation", use.cor="pairwise.complete.obs",
+ nboot=1000, r=seq(.5,1.4,by=.1), store=FALSE, weight=FALSE)
+ {
+ # data: (n,p) matrix, n-samples, p-variables
+ n <- nrow(data); p <- ncol(data)
+
+ # hclust for original data
+# METHODS <- c("ward", "single", "complete", "average", "mcquitty",
+# "median", "centroid")
+# method.hclust <- METHODS[pmatch(method.hclust, METHODS)]
+ distance <- dist.pvclust(data, method=method.dist, use.cor=use.cor)
+ data.hclust <- hclust(distance, method=method.hclust)
+
+ # ward -> ward.D
+ if(method.hclust == "ward") method.hclust <- "ward.D"
+
+ # multiscale bootstrap
+ size <- floor(n*r)
+ rl <- length(size)
+
+ if(rl == 1) {
+ if(r != 1.0)
+ warning("Relative sample size r is set to 1.0. AU p-values are not calculated\n")
+
+ r <- list(1.0)
+ }
+ else
+ r <- as.list(size/n)
+
+ mboot <- lapply(r, boot.hclust, data=data, object.hclust=data.hclust, nboot=nboot,
+ method.dist=method.dist, use.cor=use.cor,
+ method.hclust=method.hclust, store=store, weight=weight)
+
+ result <- pvclust.merge(data=data, object.hclust=data.hclust, mboot=mboot)
+
+ return(result)
+ }
+
+plot.pvclust <- function(x, print.pv=TRUE, print.num=TRUE, float=0.01,
+ col.pv=c(2,3,8), cex.pv=0.8, font.pv=NULL,
+ col=NULL, cex=NULL, font=NULL, lty=NULL, lwd=NULL,
+ main=NULL, sub=NULL, xlab=NULL, ...)
+{
+ if(is.null(main))
+ main="Cluster dendrogram with AU/BP values (%)"
+
+ if(is.null(sub))
+ sub=paste("Cluster method: ", x$hclust$method, sep="")
+
+ if(is.null(xlab))
+ xlab=paste("Distance: ", x$hclust$dist.method)
+
+ plot(x$hclust, main=main, sub=sub, xlab=xlab, col=col, cex=cex,
+ font=font, lty=lty, lwd=lwd, ...)
+
+ if(print.pv)
+ text(x, col=col.pv, cex=cex.pv, font=font.pv, float=float, print.num=print.num)
+}
+
+text.pvclust <- function(x, col=c(2,3,8), print.num=TRUE, float=0.01, cex=NULL, font=NULL, ...)
+{
+ axes <- hc2axes(x$hclust)
+ usr <- par()$usr; wid <- usr[4] - usr[3]
+ au <- as.character(round(x$edges[,"au"]*100))
+ bp <- as.character(round(x$edges[,"bp"]*100))
+ rn <- as.character(row.names(x$edges))
+ au[length(au)] <- "au"
+ bp[length(bp)] <- "bp"
+ rn[length(rn)] <- "edge #"
+ a <- text(x=axes[,1], y=axes[,2] + float * wid, au,
+ col=col[1], pos=2, offset=.3, cex=cex, font=font)
+ a <- text(x=axes[,1], y=axes[,2] + float * wid, bp,
+ col=col[2], pos=4, offset=.3, cex=cex, font=font)
+ if(print.num)
+ a <- text(x=axes[,1], y=axes[,2], rn,
+ col=col[3], pos=1, offset=.3, cex=cex, font=font)
+}
+
+print.pvclust <- function(x, which=NULL, digits=3, ...)
+{
+ if(is.null(which)) which <- 1:nrow(x$edges)
+ cat("\n")
+ cat(paste("Cluster method: ", x$hclust$method, "\n", sep=""))
+ cat(paste("Distance : ", x$hclust$dist.method, "\n\n", sep=""))
+ cat("Estimates on edges:\n\n")
+ print(round(x$edges[which,], digits=digits))
+ cat("\n")
+}
+
+summary.pvclust <- function(object, ...){
+ class(object) <- "list"
+ summary(object, ...)
+}
+
+pvrect <- function(x, alpha=0.95, pv="au", type="geq", max.only=TRUE, border=2, ...)
+ {
+ len <- nrow(x$edges)
+ member <- hc2split(x$hclust)$member
+ order <- x$hclust$order
+ usr <- par("usr")
+ xwd <- usr[2] - usr[1]
+ ywd <- usr[4] - usr[3]
+ cin <- par()$cin
+
+ ht <- c()
+ j <- 1
+
+ if(is.na(pm <- pmatch(type, c("geq", "leq", "gt", "lt"))))
+ stop("Invalid type argument: see help(pvrect)")
+
+ for(i in (len - 1):1)
+ {
+ if (pm==1) wh <- (x$edges[i,pv] >= alpha) # Greater than or EQuals
+ else if(pm==2) wh <- (x$edges[i,pv] <= alpha) # Lower than or EQuals
+ else if(pm==3) wh <- (x$edges[i,pv] > alpha) # Greater Than
+ else if(pm==4) wh <- (x$edges[i,pv] > alpha) # Lower Than
+
+ if(wh)
+ {
+ mi <- member[[i]]
+ ma <- match(mi, order)
+
+ if(max.only == FALSE || (max.only && sum(match(ma, ht, nomatch=0)) == 0))
+ {
+ xl <- min(ma)
+ xr <- max(ma)
+ yt <- x$hclust$height[i]
+ yb <- usr[3]
+
+ mx <- xwd / length(member) / 3
+ my <- ywd / 200
+
+ rect(xl - mx, yb + my, xr + mx, yt + my, border=border, shade=NULL, ...)
+
+ j <- j + 1
+ }
+ ht <- c(ht, ma)
+ }
+ }
+ }
+
+msplot <- function(x, edges=NULL, ...)
+ {
+ if(is.null(edges)) edges <- 1:length(x$msfit)
+ d <- length(edges)
+
+ mfrow.bak <- par()$mfrow
+ on.exit(par(mfrow=mfrow.bak))
+
+ par(mfrow=n2mfrow(d))
+
+ for(i in edges) {
+ if(i == 1 || (i %% 10 == 1 && i > 20))
+ main <- paste(i, "st edge", sep="")
+ else if(i == 2 || (i %% 10 == 2 && i > 20))
+ main <- paste(i, "nd edge", sep="")
+ else if(i == 3 || (i %% 10 == 3 && i > 20))
+ main <- paste(i, "rd edge", sep="")
+ else
+ main <- paste(i, "th edge", sep="")
+
+ plot(x$msfit[[i]], main=main, ...)
+ }
+ }
+
+lines.pvclust <- function(x, alpha=0.95, pv="au", type="geq", col=2, lwd=2, ...)
+ {
+ len <- nrow(x$edges)
+ member <- hc2split(x$hclust)$member
+ order <- x$hclust$order
+ usr <- par("usr")
+ xwd <- usr[2] - usr[1]
+ ywd <- usr[4] - usr[3]
+ cin <- par()$cin
+
+ ht <- c()
+ j <- 1
+
+ if(is.na(pm <- pmatch(type, c("geq", "leq", "gt", "lt"))))
+ stop("Invalid type argument: see help(lines.pvclust)")
+
+ for(i in (len - 1):1)
+ {
+ if (pm==1) wh <- (x$edges[i,pv] >= alpha) # Greater than or EQuals
+ else if(pm==2) wh <- (x$edges[i,pv] <= alpha) # Lower than or EQuals
+ else if(pm==3) wh <- (x$edges[i,pv] > alpha) # Greater Than
+ else if(pm==4) wh <- (x$edges[i,pv] > alpha) # Lower Than
+
+ if(wh)
+ {
+ mi <- member[[i]]
+ ma <- match(mi, order)
+
+ if(sum(match(ma, ht, nomatch=0)) == 0)
+ {
+ xl <- min(ma)
+ xr <- max(ma)
+ yt <- x$hclust$height[i]
+ yb <- usr[3]
+
+ mx <- xwd/length(member)/10
+
+ segments(xl-mx, yb, xr+mx, yb, xpd=TRUE, col=col, lwd=lwd, ...)
+
+ j <- j + 1
+ }
+ ht <- c(ht, ma)
+ }
+ }
+ }
+
+pvpick <- function(x, alpha=0.95, pv="au", type="geq", max.only=TRUE)
+ {
+ len <- nrow(x$edges)
+ member <- hc2split(x$hclust)$member
+ order <- x$hclust$order
+
+ ht <- c()
+ a <- list(clusters=list(), edges=c()); j <- 1
+
+ if(is.na(pm <- pmatch(type, c("geq", "leq", "gt", "lt"))))
+ stop("Invalid type argument: see help(pickup)")
+
+ for(i in (len - 1):1)
+ {
+ if (pm==1) wh <- (x$edges[i,pv] >= alpha) # Greater than or Equals
+ else if(pm==2) wh <- (x$edges[i,pv] <= alpha) # Lower than or Equals
+ else if(pm==3) wh <- (x$edges[i,pv] > alpha) # Greater Than
+ else if(pm==4) wh <- (x$edges[i,pv] < alpha) # Lower Than
+
+ if(wh)
+ {
+ mi <- member[[i]]
+ ma <- match(mi, order)
+
+ if(max.only == FALSE || (max.only && sum(match(ma, ht, nomatch=0)) == 0))
+ {
+ a$clusters[[j]] <- x$hclust$labels[mi]
+ a$edges <- c(a$edges,i)
+
+ j <- j + 1
+ }
+ ht <- c(ht, ma)
+ }
+ }
+
+ a$edges <- a$edges[length(a$edges):1]
+ a$clusters <- a$clusters[length(a$edges):1]
+
+ return(a)
+ }
+
+parPvclust <- function(cl=NULL, data, method.hclust="average",
+ method.dist="correlation", use.cor="pairwise.complete.obs",
+ nboot=1000, r=seq(.5,1.4,by=.1), store=FALSE,
+ weight=FALSE,
+ init.rand=TRUE, seed=NULL, iseed=NULL)
+ {
+ # if(!(require(snow))) stop("Package snow is required for parPvclust.")
+ if(!(require(parallel))) stop("Package parallel is required for parPvclust.")
+
+ if((ncl <- length(cl)) < 2 || ncl > nboot) {
+ if(ncl > nboot)
+ warning("Too small value for nboot: non-parallel version is executed.")
+ else
+ warning("Too small (or NULL) cluster: non-parallel version is executed.")
+
+ return(pvclust(data,method.hclust,method.dist,use.cor,nboot,r,store))
+ }
+
+ if(init.rand) {
+ if(is.null(iseed) && !is.null(seed)) {
+ warning("\"seed\" option is deprecated. It is available for back compatibility but will be unavailable in the future.\nConsider using \"iseed\" instead.")
+
+ if(length(seed) != length(cl))
+ stop("seed and cl should have the same length.")
+
+ # setting random seeds
+ parallel::parLapply(cl, as.list(seed), set.seed)
+ } else {
+ parallel::clusterSetRNGStream(cl = cl, iseed = iseed)
+ }
+ }
+
+ # data: (n,p) matrix, n-samples, p-variables
+ n <- nrow(data); p <- ncol(data)
+
+ # hclust for original data
+ METHODS <- c("ward", "single", "complete", "average", "mcquitty",
+ "median", "centroid")
+ method.hclust <- METHODS[pmatch(method.hclust, METHODS)]
+ distance <- dist.pvclust(data, method=method.dist, use.cor=use.cor)
+ data.hclust <- hclust(distance, method=method.hclust)
+
+ # multiscale bootstrap
+ size <- floor(n*r)
+ rl <- length(size)
+
+ if(rl == 1) {
+ if(r != 1.0)
+ warning("Relative sample size r is set to 1.0. AU p-values are not calculated\n")
+
+ r <- list(1.0)
+ }
+ else
+ r <- as.list(size/n)
+
+ nbl <- as.list(rep(nboot %/% ncl,times=ncl))
+
+ if((rem <- nboot %% ncl) > 0)
+ nbl[1:rem] <- lapply(nbl[1:rem], "+", 1)
+
+ cat("Multiscale bootstrap... ")
+
+ mlist <- parallel::parLapply(cl, nbl, pvclust.node,
+ r=r, data=data, object.hclust=data.hclust, method.dist=method.dist,
+ use.cor=use.cor, method.hclust=method.hclust,
+ store=store, weight=weight)
+ cat("Done.\n")
+
+ mboot <- mlist[[1]]
+
+ for(i in 2:ncl) {
+ for(j in 1:rl) {
+ mboot[[j]]$edges.cnt <- mboot[[j]]$edges.cnt + mlist[[i]][[j]]$edges.cnt
+ mboot[[j]]$nboot <- mboot[[j]]$nboot + mlist[[i]][[j]]$nboot
+ mboot[[j]]$store <- c(mboot[[j]]$store, mlist[[i]][[j]]$store)
+ }
+ }
+
+ result <- pvclust.merge( data=data, object.hclust=data.hclust, mboot=mboot)
+
+ return(result)
+ }
+
+msfit <- function(bp, r, nboot) {
+
+ if(length(bp) != length(r))
+ stop("bp and r should have the same length")
+
+ nboot <- rep(nboot, length=length(bp))
+
+ use <- bp > 0 & bp < 1
+
+ p <- se <- c(0,0); names(p) <- names(se) <- c("au", "bp")
+ coef <- c(0,0); names(coef) <- c("v", "c")
+
+ a <- list(p=p, se=se, coef=coef, df=0, rss=0, pchi=0); class(a) <- "msfit"
+
+ if(sum(use) < 2) {
+ # if(mean(bp) < .5) a$p[] <- c(0, 0) else a$p[] <- c(1, 1)
+ if(mean(bp) < .5) a$p[] <- c(0, bp[r==1.0]) else a$p[] <- c(1, bp[r==1.0])
+ return(a)
+ }
+
+ bp <- bp[use]; r <- r[use]; nboot <- nboot[use]
+ zz <- -qnorm(bp)
+ vv <- ((1 - bp) * bp) / (dnorm(zz)^2 * nboot)
+ a$use <- use; a$r <- r; a$zz <- zz
+
+ X <- cbind(sqrt(r), 1/sqrt(r)); dimnames(X) <- list(NULL, c("v","c"))
+ fit <- lsfit(X, zz, 1/vv, intercept=FALSE)
+ a$coef <- coef <- fit$coef
+
+ h.au <- c(1, -1); h.bp <- c(1, 1)
+
+ z.au <- drop(h.au %*% coef); z.bp <- drop(h.bp %*% coef)
+ a$p["au"] <- pnorm(-z.au); a$p["bp"] <- pnorm(-z.bp)
+ V <- solve(crossprod(X, X/vv))
+ vz.au <- drop(h.au %*% V %*% h.au); vz.bp <- drop(h.bp %*% V %*% h.bp)
+ a$se["au"] <- dnorm(z.au) * sqrt(vz.au); a$se["bp"] <- dnorm(z.bp) * sqrt(vz.bp)
+ a$rss <- sum(fit$residual^2/vv)
+
+ if((a$df <- sum(use) - 2) > 0) {
+ a$pchi <- pchisq(a$rss, lower.tail=FALSE, df=a$df)
+ }
+ else a$pchi <- 1.0
+
+ return(a)
+}
+
+plot.msfit <- function(x, curve=TRUE, main=NULL, sub=NULL, xlab=NULL, ylab=NULL, ...)
+{
+ if(is.null(main)) main="Curve fitting for multiscale bootstrap resampling"
+ if(is.null(sub))
+ {
+ sub <- paste("AU = ", round(x$p["au"], digits=2),
+ ", BP = ", round(x$p["bp"], digits=2),
+ ", v = ", round(x$coef["v"], digits=2),
+ ", c = ", round(x$coef["c"], digits=2),
+ ", pchi = ", round(x$pchi, digits=2))
+ }
+ if(is.null(xlab)) xlab=expression(sqrt(r))
+ if(is.null(ylab)) ylab=expression(z-value)
+
+ a <- sqrt(x$r); b <- x$zz
+
+ if(!is.null(a) && !is.null(b)) {
+ plot(a, b, main=main, sub=sub, xlab=xlab, ylab=ylab, ...)
+ if(curve) lines(x, ...)
+ }
+ else if (!is.null(a)){
+ plot(0, 0, main=main, sub=sub, xlab=xlab, ylab=ylab,
+ type="n", xaxt="n", yaxt="n", ...)
+ a <- text(mean(a), 0, "No fitting")
+ }
+}
+
+lines.msfit <- function(x, col=2, lty=1, ...) {
+ v <- x$coef["v"]; c <- x$coef["c"]
+ curve(v * x + c / x, add=TRUE, col=col, lty=lty)
+}
+
+summary.msfit <- function(object, digits=3, ...) {
+ cat("\nResult of curve fitting for multiscale bootstrap resampling:\n\n")
+
+ cat("Estimated p-values:\n")
+ pv <- data.frame(object$p, object$se)
+ names(pv) <- c("Estimate", "Std. Error"); row.names(pv) <- c("au", "bp")
+ print(pv, digits=digits); cat("\n")
+
+ cat("Estimated coefficients:\n")
+ coef <- object$coef
+ print(coef, digits=digits); cat("\n")
+
+ cat(paste("Residual sum of squares: ", round(object$rss,digits=digits)),
+ ", p-value: ", round(object$pchi, digits=digits),
+ " on ", object$df, " DF\n\n", sep="")
+}
+
+seplot <- function(object, type=c("au", "bp"), identify=FALSE,
+ main=NULL, xlab=NULL, ylab=NULL, ...)
+ {
+ if(!is.na(pm <- pmatch(type[1], c("au", "bp")))) {
+ wh <- c("au", "bp")[pm]
+
+ if(is.null(main))
+ main <- "p-value vs standard error plot"
+ if(is.null(xlab))
+ xlab <- c("AU p-value", "BP value")[pm]
+ if(is.null(ylab))
+ ylab <- "Standard Error"
+
+ plot(object$edges[,wh], object$edges[,paste("se", wh, sep=".")],
+ main=main, xlab=xlab, ylab=ylab, ...)
+ if(identify)
+ identify(x=object$edges[,wh], y=object$edges[,paste("se", wh, sep=".")],
+ labels=row.names(object$edges))
+ }
+ else stop("'type' should be \"au\" or \"bp\".")
+ }
diff --git a/man/lung.Rd b/man/lung.Rd
index 06d0462..7ebcdec 100755
--- a/man/lung.Rd
+++ b/man/lung.Rd
@@ -14,6 +14,7 @@ There are 916 observations of genes for each lung tissue.
section in this help for original data source.
}
\examples{
+\donttest{
## Reading the data
data(lung)
@@ -41,6 +42,7 @@ lung.pp$clusters[[2]]
## Print its edge number
lung.pp$edges[2]
+}
## We recommend parallel computing for large dataset as this one
\dontrun{
diff --git a/man/msfit.Rd b/man/msfit.Rd
index 2166d68..2f84619 100755
--- a/man/msfit.Rd
+++ b/man/msfit.Rd
@@ -66,5 +66,5 @@ msfit(bp, r, nboot)
"An approximately unbiased test of phylogenetic tree selection",
\emph{Systematic Biology}, 51, 492-508.
}
-\author{Ryota Suzuki \email{ryota.suzuki at is.titech.ac.jp}}
+\author{Ryota Suzuki \email{suzuki at ef-prime.com}}
\keyword{htest}
\ No newline at end of file
diff --git a/man/msplot.Rd b/man/msplot.Rd
index 4b618ee..3713473 100755
--- a/man/msplot.Rd
+++ b/man/msplot.Rd
@@ -12,5 +12,5 @@ msplot(x, edges=NULL, ...)
\item{...}{other parameters to be used in the function.}
}
\seealso{\code{\link{plot.msfit}}}
-\author{Ryota Suzuki \email{ryota.suzuki at is.titech.ac.jp}}
+\author{Ryota Suzuki \email{suzuki at ef-prime.com}}
\keyword{hplot}
diff --git a/man/plot.pvclust.Rd b/man/plot.pvclust.Rd
index 9dbaf97..b3296de 100755
--- a/man/plot.pvclust.Rd
+++ b/man/plot.pvclust.Rd
@@ -61,5 +61,5 @@
"An approximately unbiased test of phylogenetic tree selection",
\emph{Systematic Biology}, 51, 492-508.
}
-\author{Ryota Suzuki \email{ryota.suzuki at is.titech.ac.jp}}
+\author{Ryota Suzuki \email{suzuki at ef-prime.com}}
\keyword{hplot}
diff --git a/man/print.pvclust.Rd b/man/print.pvclust.Rd
index 0212358..3314756 100755
--- a/man/print.pvclust.Rd
+++ b/man/print.pvclust.Rd
@@ -33,5 +33,5 @@
the boundary.}
\item{pchi}{\eqn{p}-values of chi-square test based on asymptotic theory.}
}
-\author{Ryota Suzuki \email{ryota.suzuki at is.titech.ac.jp}}
+\author{Ryota Suzuki \email{suzuki at ef-prime.com}}
\keyword{print}
\ No newline at end of file
diff --git a/man/pvclust.Rd b/man/pvclust.Rd
index 00fefd4..a71d830 100755
--- a/man/pvclust.Rd
+++ b/man/pvclust.Rd
@@ -1,203 +1,208 @@
-\name{pvclust}
-\alias{pvclust}
-\alias{parPvclust}
-\title{Calculating P-values for Hierchical Clustering}
-\description{
- calculates \eqn{p}-values for hierarchical clustering via
- multiscale bootstrap resampling. Hierarchical clustering is done for
- given data and \eqn{p}-values are computed for each of the clusters.
-}
-\usage{
-pvclust(data, method.hclust="average",
- method.dist="correlation", use.cor="pairwise.complete.obs",
- nboot=1000, r=seq(.5,1.4,by=.1), store=FALSE, weight=FALSE)
-
-parPvclust(cl, data, method.hclust="average",
- method.dist="correlation", use.cor="pairwise.complete.obs",
- nboot=1000, r=seq(.5,1.4,by=.1), store=FALSE, weight=FALSE,
- init.rand=TRUE, seed=NULL)
-
-}
-\arguments{
- \item{data}{numeric data matrix or data frame.}
- \item{method.hclust}{
- the agglomerative method used in hierarchical clustering. This
- should be (an abbreviation of) one of \code{"average"}, \code{"ward"},
- \code{"single"}, \code{"complete"}, \code{"mcquitty"},
- \code{"median"} or \code{"centroid"}. The default is
- \code{"average"}. See \code{method} argument in
- \code{\link[stats]{hclust}}.
- }
- \item{method.dist}{the distance measure to be used. This should be (an
- abbreviation of) one of \code{"correlation"}, \code{"uncentered"},
- \code{"abscor"} or those which are allowed for \code{method}
- argument in \code{dist} function. The default is
- \code{"correlation"}. See \emph{details} section in this help and
- \code{method} argument in \code{\link[stats]{dist}}.
- }
- \item{use.cor}{character string which specifies the method for
- computing correlation with data including missing values. This
- should be (an abbreviation of) one of \code{"all.obs"},
- \code{"complete.obs"} or \code{"pairwise.complete.obs"}. See
- the \code{use} argument in \code{\link[stats]{cor}} function.
- }
- \item{nboot}{the number of bootstrap replications. The default is
- \code{1000}.}
- \item{r}{numeric vector which specifies the relative sample sizes of
- bootstrap replications. For original sample size \eqn{n} and
- bootstrap sample size \eqn{n'}, this is defined as \eqn{r=n'/n}.}
- \item{store}{locical. If \code{store=TRUE}, all bootstrap replications
- are stored in the output object. The default is \code{FALSE}.}
- \item{cl}{\pkg{snow} cluster object which may be generated by
- function \code{makeCluster}. See \code{\link[snow]{snow-startstop}}
- in \pkg{snow} package.}
- \item{weight}{logical. If \code{weight=TRUE}, resampling is made by
- weight vector instead of index vector. Useful for large \code{r}
- value (\code{r>10}). Currently, available only for distance
- \code{"correlation"} and \code{"abscor"}.}
- \item{init.rand}{logical. If \code{init.rand=TRUE}, random number
- generators are initialized at child processes. Random seeds can be
- set by \code{seed} argument.}
- \item{seed}{integer vector of random seeds. It should have the same
- length as \code{cl}. If \code{NULL} is specified,
- \code{1:length(cl)} is used as seed vector. The default is \code{NULL}.}
-}
-\details{
- Function \code{pvclust} conducts multiscale bootstrap resampling to calculate
- \eqn{p}-values for each cluster in the result of hierarchical
- clustering. \code{parPvclust} is the parallel version of this
- procedure which depends on \pkg{snow} package for parallel
- computation.
-
- For data expressed as \eqn{(n \times p)}{(n, p)} matrix or data frame, we
- assume that the data is \eqn{n} observations of \eqn{p} objects, which
- are to be clustered. The \eqn{i}'th row vector corresponds to the
- \eqn{i}'th observation of these objects and the \eqn{j}'th column
- vector corresponds to a sample of \eqn{j}'th object with size \eqn{n}.
-
- There are several methods to measure the dissimilarities between
- objects. For data matrix \eqn{X=\{x_{ij}\}}{X},
- \code{"correlation"}
- method takes
- \deqn{
- 1 - \frac{
- \sum_{i=1}^n (x_{ij} - \bar{x}_j) (x_{ik} - \bar{x}_k)
- }
- {
- \sqrt{\sum_{i=1}^n (x_{ij} - \bar{x}_j)^2}
- \sqrt{\sum_{i=1}^n (x_{ik} - \bar{x}_k)^2}
- }
- }{%
- 1 - cor(X)[j,k]
- }
- for dissimilarity between \eqn{j}'th and \eqn{k}'th object, where
- \eqn{\bar{x}_j = \frac{1}{n} \sum_{i=1}^n x_{ij} \mbox{and}
- \bar{x}_k = \frac{1}{n} \sum_{i=1}^n x_{ik}}{cor is function \code{cor}}.
-
- \code{"uncentered"} takes uncentered sample correlation
- \deqn{
- 1 - \frac{
- \sum_{i=1}^n x_{ij} x_{ik}
- }
- {
- \sqrt{\sum_{i=1}^n x_{ij}^2}
- \sqrt{\sum_{i=1}^n x_{ik}^2}
- }
- }{%
- 1 - sum(x[,j] * x[,k]) / (sqrt(sum(x[,j]^2)) * sqrt(sum(x[,k]^2)))
- }
- and \code{"abscor"} takes the absolute value of sample correlation
- \deqn{
- 1 - \ \Biggl| \frac{
- \sum_{i=1}^n (x_{ij} - \bar{x}_j) (x_{ik} - \bar{x}_k)
- }
- {
- \sqrt{\sum_{i=1}^n (x_{ij} - \bar{x}_j)^2}
- \sqrt{\sum_{i=1}^n (x_{ik} - \bar{x}_k)^2}
- } \Biggl|.
- }{%
- 1 - abs(cor(X)[j,k]).
- }
- }
-\value{
- \item{hclust}{hierarchical clustering for original data generated by
- function \code{hclust}. See \code{\link[stats]{hclust}} for details.}
- \item{edges}{data frame object which contains \eqn{p}-values and
- supporting informations such as standard errors.}
- \item{count}{data frame object which contains primitive information
- about the result of multiscale bootstrap resampling.}
- \item{msfit}{list whose elements are results of curve fitting for
- multiscale bootstrap resampling, of class \code{msfit}. See
- \code{\link{msfit}} for details.}
- \item{nboot}{numeric vector of number of bootstrap replications.}
- \item{r}{numeric vector of the relative sample size for bootstrap
- replications.}
- \item{store}{list contains bootstrap replications if \code{store=TRUE}
- was given for function \code{pvclust} or \code{parPvclust}.}
-}
-\seealso{\code{\link{lines.pvclust}}, \code{\link{print.pvclust}},
- \code{\link{msfit}}, \code{\link{plot.pvclust}},
- \code{\link{text.pvclust}}, \code{\link{pvrect}} and
- \code{\link{pvpick}}.}
-\references{
- Shimodaira, H. (2004)
- "Approximately unbiased tests of regions using multistep-multiscale
- bootstrap resampling",
- \emph{Annals of Statistics}, 32, 2616-2641.
-
- Shimodaira, H. (2002)
- "An approximately unbiased test of phylogenetic tree selection",
- \emph{Systematic Biology}, 51, 492-508.
-
- Suzuki, R. and Shimodaira, H. (2004)
- "An application of multiscale bootstrap resampling to hierarchical
- clustering of microarray data: How accurate are these clusters?",
- \emph{The Fifteenth International Conference on Genome Informatics 2004},
- P034.
-
- \url{http://www.is.titech.ac.jp/~shimo/prog/pvclust/}
-}
-\examples{
-## using Boston data in package MASS
-library(MASS)
-data(Boston)
-
-## multiscale bootstrap resampling
-boston.pv <- pvclust(Boston, nboot=100)
-
-## CAUTION: nboot=100 may be too small for actual use.
-## We suggest nboot=1000 or larger.
-## plot/print functions will be useful for diagnostics.
-
-## plot dendrogram with p-values
-plot(boston.pv)
-
-ask.bak <- par()$ask
-par(ask=TRUE)
-
-## highlight clusters with high au p-values
-pvrect(boston.pv)
-
-## print the result of multiscale bootstrap resampling
-print(boston.pv, digits=3)
-
-## plot diagnostic for curve fitting
-msplot(boston.pv, edges=c(2,4,6,7))
-
-par(ask=ask.bak)
-
-## Print clusters with high p-values
-boston.pp <- pvpick(boston.pv)
-boston.pp
-
-\dontrun{
-## parallel computation via snow package
-library(snow)
-cl <- makeCluster(10, type="MPI")
-
-## parallel version of pvclust
-boston.pv <- parPvclust(cl, Boston, nboot=1000)
-}
-}
-\author{Ryota Suzuki \email{ryota.suzuki at is.titech.ac.jp}}
-\keyword{cluster}
\ No newline at end of file
+\name{pvclust}
+\alias{pvclust}
+\alias{parPvclust}
+\title{Calculating P-values for Hierchical Clustering}
+\description{
+ calculates \eqn{p}-values for hierarchical clustering via
+ multiscale bootstrap resampling. Hierarchical clustering is done for
+ given data and \eqn{p}-values are computed for each of the clusters.
+}
+\usage{
+pvclust(data, method.hclust="average",
+ method.dist="correlation", use.cor="pairwise.complete.obs",
+ nboot=1000, r=seq(.5,1.4,by=.1), store=FALSE, weight=FALSE)
+
+parPvclust(cl=NULL, data, method.hclust="average",
+ method.dist="correlation", use.cor="pairwise.complete.obs",
+ nboot=1000, r=seq(.5,1.4,by=.1), store=FALSE, weight=FALSE,
+ init.rand=TRUE, seed=NULL, iseed=NULL)
+
+}
+\arguments{
+ \item{data}{numeric data matrix or data frame.}
+ \item{method.hclust}{
+ the agglomerative method used in hierarchical clustering. This
+ should be (an abbreviation of) one of \code{"average"}, \code{"ward"},
+ \code{"single"}, \code{"complete"}, \code{"mcquitty"},
+ \code{"median"} or \code{"centroid"}. The default is
+ \code{"average"}. See \code{method} argument in
+ \code{\link[stats]{hclust}}.
+ }
+ \item{method.dist}{the distance measure to be used. This should be (an
+ abbreviation of) one of \code{"correlation"}, \code{"uncentered"},
+ \code{"abscor"} or those which are allowed for \code{method}
+ argument in \code{dist} function. The default is
+ \code{"correlation"}. See \emph{details} section in this help and
+ \code{method} argument in \code{\link[stats]{dist}}.
+ }
+ \item{use.cor}{character string which specifies the method for
+ computing correlation with data including missing values. This
+ should be (an abbreviation of) one of \code{"all.obs"},
+ \code{"complete.obs"} or \code{"pairwise.complete.obs"}. See
+ the \code{use} argument in \code{\link[stats]{cor}} function.
+ }
+ \item{nboot}{the number of bootstrap replications. The default is
+ \code{1000}.}
+ \item{r}{numeric vector which specifies the relative sample sizes of
+ bootstrap replications. For original sample size \eqn{n} and
+ bootstrap sample size \eqn{n'}, this is defined as \eqn{r=n'/n}.}
+ \item{store}{locical. If \code{store=TRUE}, all bootstrap replications
+ are stored in the output object. The default is \code{FALSE}.}
+ \item{cl}{a cluster object created by package \pkg{parallel} or \pkg{snow}.
+ If NULL, use the registered default cluster.}
+ \item{weight}{logical. If \code{weight=TRUE}, resampling is made by
+ weight vector instead of index vector. Useful for large \code{r}
+ value (\code{r>10}). Currently, available only for distance
+ \code{"correlation"} and \code{"abscor"}.}
+% \item{init.rand}{logical. If \code{init.rand=TRUE}, random number
+% generators are initialized at child processes. Random seeds can be
+% set by \code{seed} argument.}
+% \item{seed}{integer vector of random seeds. It should have the same
+% length as \code{cl}. If \code{NULL} is specified,
+% \code{1:length(cl)} is used as seed vector. The default is \code{NULL}.}
+ \item{init.rand}{logical. If \code{init.rand=TRUE}, random number generators are initialized.
+ Use \code{iseed} argument to achieve reproducible results.}
+ \item{seed}{integer vector of random seeds. It should have the same
+ length as \code{cl}. \strong{This argument is duplicated and will be unavailable in the future. Consider using \code{iseed} instead.} }
+ \item{iseed}{an integer to be passed to \code{clusterSetRNGStream} when \code{init.rand=TRUE}.}
+
+}
+\details{
+ Function \code{pvclust} conducts multiscale bootstrap resampling to calculate
+ \eqn{p}-values for each cluster in the result of hierarchical
+ clustering. \code{parPvclust} is the parallel version of this
+ procedure which depends on package \pkg{parallel} for parallel computation.
+
+ For data expressed as \eqn{(n \times p)}{(n, p)} matrix or data frame, we
+ assume that the data is \eqn{n} observations of \eqn{p} objects, which
+ are to be clustered. The \eqn{i}'th row vector corresponds to the
+ \eqn{i}'th observation of these objects and the \eqn{j}'th column
+ vector corresponds to a sample of \eqn{j}'th object with size \eqn{n}.
+
+ There are several methods to measure the dissimilarities between
+ objects. For data matrix \eqn{X=\{x_{ij}\}}{X},
+ \code{"correlation"}
+ method takes
+ \deqn{
+ 1 - \frac{
+ \sum_{i=1}^n (x_{ij} - \bar{x}_j) (x_{ik} - \bar{x}_k)
+ }
+ {
+ \sqrt{\sum_{i=1}^n (x_{ij} - \bar{x}_j)^2}
+ \sqrt{\sum_{i=1}^n (x_{ik} - \bar{x}_k)^2}
+ }
+ }{%
+ 1 - cor(X)[j,k]
+ }
+ for dissimilarity between \eqn{j}'th and \eqn{k}'th object, where
+ \eqn{\bar{x}_j = \frac{1}{n} \sum_{i=1}^n x_{ij} \mbox{and}
+ \bar{x}_k = \frac{1}{n} \sum_{i=1}^n x_{ik}}{cor is function \code{cor}}.
+
+ \code{"uncentered"} takes uncentered sample correlation
+ \deqn{
+ 1 - \frac{
+ \sum_{i=1}^n x_{ij} x_{ik}
+ }
+ {
+ \sqrt{\sum_{i=1}^n x_{ij}^2}
+ \sqrt{\sum_{i=1}^n x_{ik}^2}
+ }
+ }{%
+ 1 - sum(x[,j] * x[,k]) / (sqrt(sum(x[,j]^2)) * sqrt(sum(x[,k]^2)))
+ }
+ and \code{"abscor"} takes the absolute value of sample correlation
+ \deqn{
+ 1 - \ \Biggl| \frac{
+ \sum_{i=1}^n (x_{ij} - \bar{x}_j) (x_{ik} - \bar{x}_k)
+ }
+ {
+ \sqrt{\sum_{i=1}^n (x_{ij} - \bar{x}_j)^2}
+ \sqrt{\sum_{i=1}^n (x_{ik} - \bar{x}_k)^2}
+ } \Biggl|.
+ }{%
+ 1 - abs(cor(X)[j,k]).
+ }
+ }
+\value{
+ \item{hclust}{hierarchical clustering for original data generated by
+ function \code{hclust}. See \code{\link[stats]{hclust}} for details.}
+ \item{edges}{data frame object which contains \eqn{p}-values and
+ supporting informations such as standard errors.}
+ \item{count}{data frame object which contains primitive information
+ about the result of multiscale bootstrap resampling.}
+ \item{msfit}{list whose elements are results of curve fitting for
+ multiscale bootstrap resampling, of class \code{msfit}. See
+ \code{\link{msfit}} for details.}
+ \item{nboot}{numeric vector of number of bootstrap replications.}
+ \item{r}{numeric vector of the relative sample size for bootstrap
+ replications.}
+ \item{store}{list contains bootstrap replications if \code{store=TRUE}
+ was given for function \code{pvclust} or \code{parPvclust}.}
+}
+\seealso{\code{\link{lines.pvclust}}, \code{\link{print.pvclust}},
+ \code{\link{msfit}}, \code{\link{plot.pvclust}},
+ \code{\link{text.pvclust}}, \code{\link{pvrect}} and
+ \code{\link{pvpick}}.}
+\references{
+ Shimodaira, H. (2004)
+ "Approximately unbiased tests of regions using multistep-multiscale
+ bootstrap resampling",
+ \emph{Annals of Statistics}, 32, 2616-2641.
+
+ Shimodaira, H. (2002)
+ "An approximately unbiased test of phylogenetic tree selection",
+ \emph{Systematic Biology}, 51, 492-508.
+
+ Suzuki, R. and Shimodaira, H. (2004)
+ "An application of multiscale bootstrap resampling to hierarchical
+ clustering of microarray data: How accurate are these clusters?",
+ \emph{The Fifteenth International Conference on Genome Informatics 2004},
+ P034.
+
+ \url{http://www.is.titech.ac.jp/~shimo/prog/pvclust/}
+}
+\examples{
+## using Boston data in package MASS
+library(MASS)
+data(Boston)
+
+## multiscale bootstrap resampling
+boston.pv <- pvclust(Boston, nboot=100)
+
+## CAUTION: nboot=100 may be too small for actual use.
+## We suggest nboot=1000 or larger.
+## plot/print functions will be useful for diagnostics.
+
+## plot dendrogram with p-values
+plot(boston.pv)
+
+ask.bak <- par()$ask
+par(ask=TRUE)
+
+## highlight clusters with high au p-values
+pvrect(boston.pv)
+
+## print the result of multiscale bootstrap resampling
+print(boston.pv, digits=3)
+
+## plot diagnostic for curve fitting
+msplot(boston.pv, edges=c(2,4,6,7))
+
+par(ask=ask.bak)
+
+## Print clusters with high p-values
+boston.pp <- pvpick(boston.pv)
+boston.pp
+
+\dontrun{
+## parallel computation
+library(parallel)
+cl <- makeCluster(2, type = "PSOCK")
+
+## parallel version of pvclust
+boston.pv <- parPvclust(cl, Boston, nboot=1000)
+stopCluster(cl)
+}
+}
+\author{Ryota Suzuki \email{suzuki at ef-prime.com}}
+\keyword{cluster}
diff --git a/man/pvpick.Rd b/man/pvpick.Rd
index 7c062e3..cad64a2 100755
--- a/man/pvpick.Rd
+++ b/man/pvpick.Rd
@@ -45,5 +45,5 @@ pvrect(x, alpha=0.95, pv="au", type="geq", max.only=TRUE, border=2, ...)
(number) corresponds to the \eqn{i}'th name vector in
\code{clusters}.}
}
-\author{Ryota Suzuki \email{ryota.suzuki at is.titech.ac.jp}}
+\author{Ryota Suzuki \email{suzuki at ef-prime.com}}
\keyword{aplot}
\ No newline at end of file
diff --git a/man/seplot.Rd b/man/seplot.Rd
index a12b7fa..fe3bc4f 100755
--- a/man/seplot.Rd
+++ b/man/seplot.Rd
@@ -19,5 +19,5 @@ xlab=NULL, ylab=NULL, ...)
\item{...}{other graphical parameters to be passed to generic
\code{plot} or \code{identify} function.}
}
-\author{Ryota Suzuki \email{ryota.suzuki at is.titech.ac.jp}}
+\author{Ryota Suzuki \email{suzuki at ef-prime.com}}
\keyword{hplot}
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/pvclust.git
More information about the debian-med-commit
mailing list