[med-svn] [r-bioc-edger] 01/01: New upstream version 3.14.0+dfsg
Andreas Tille
tille at debian.org
Fri Sep 8 20:33:39 UTC 2017
This is an automated email from the git hooks/post-receive script.
tille pushed a commit to branch upstream
in repository r-bioc-edger.
commit 9f3654a818c526a41096915299a6fa071a7188b0
Author: Andreas Tille <tille at debian.org>
Date: Fri Sep 8 22:20:31 2017 +0200
New upstream version 3.14.0+dfsg
---
DESCRIPTION | 10 +-
NAMESPACE | 12 +-
R/DGEList.R | 15 +-
R/S3methods.R | 2 +-
R/addPriorCount.R | 29 --
R/adjustedProfileLik.R | 68 ++-
R/aveLogCPM.R | 60 ++-
R/cpm.R | 25 +-
R/decidetestsDGE.R | 57 +--
R/diffSpliceDGE.R | 5 +-
R/dispBinTrend.R | 7 +-
R/estimateDisp.R | 82 +--
R/estimateGLMCommonDisp.R | 2 +-
R/expandAsMatrix.R | 18 +-
R/geneset-DGEList.R | 26 +-
R/gini.R | 11 +-
R/glmQLFTest.R | 61 ++-
R/glmTreat.R | 132 ++---
R/glmfit.R | 20 +-
R/locfitByCol.R | 5 +-
R/makeCompressedMatrix.R | 164 ------
R/mglmLevenberg.R | 60 ++-
R/mglmOneGroup.R | 36 +-
R/mglmOneWay.R | 29 +-
R/nbinomDeviance.R | 50 +-
R/normalizeChIPtoInput.R | 6 +-
R/plotMD.R | 45 +-
R/plotMDS.DGEList.R | 11 +-
R/predFC.R | 34 +-
R/readDGE.R | 18 +-
R/scaleOffset.R | 37 --
R/spliceVariants.R | 10 +-
R/topTags.R | 2 +-
build/vignette.rds | Bin 228 -> 228 bytes
inst/CITATION | 37 +-
inst/NEWS.Rd | 66 ---
inst/doc/edgeR.pdf | Bin 45765 -> 48664 bytes
man/addPriorCount.Rd | 61 ---
man/aveLogCPM.Rd | 4 +-
man/camera.DGEList.Rd | 17 +-
man/decidetestsDGE.Rd | 68 +--
man/dim.Rd | 7 +
man/edgeR-package.Rd | 25 +-
man/estimateDisp.Rd | 9 +-
man/expandAsMatrix.Rd | 21 +-
man/gini.Rd | 2 +-
man/glmQLFTest.Rd | 21 +-
man/glmTreat.Rd | 20 +-
man/goodTuring.Rd | 6 +-
man/makeCompressedMatrix.Rd | 108 ----
man/nbinomDeviance.Rd | 6 +-
man/plotMD.Rd | 31 +-
man/plotMDS.DGEList.Rd | 8 +-
man/plotQLDisp.Rd | 11 +-
man/plotSmear.Rd | 17 +-
man/predFC.Rd | 7 +-
man/processAmplicons.Rd | 34 +-
man/roast.DGEList.Rd | 32 +-
man/scaleOffset.Rd | 35 --
man/topTags.Rd | 2 +-
src/R_add_prior_count.cpp | 64 ---
src/R_add_repeat_matrices.cpp | 41 --
src/R_ave_log_cpm.cpp | 62 ---
src/R_calculate_cpm.cpp | 87 ----
src/R_check_counts.cpp | 137 -----
src/R_check_poisson_bound.cpp | 50 --
src/R_compute_apl.cpp | 111 -----
src/R_compute_nbdev.cpp | 113 ++---
src/R_cr_adjust.cpp | 43 ++
src/R_get_one_way_fitted.cpp | 60 ---
src/R_initialize_levenberg.cpp | 209 --------
src/R_levenberg.cpp | 125 ++---
src/R_one_group.cpp | 136 ++---
src/add_prior.cpp | 54 --
src/add_prior.h | 25 -
src/adj_coxreid.cpp | 63 +--
src/fmm_spline.c | 17 +-
src/glm.h | 17 +-
src/glm_levenberg.cpp | 123 +++--
src/glm_one_group.cpp | 20 +-
src/init.cpp | 24 +-
src/matvec_check.cpp | 178 ++-----
src/matvec_check.h | 32 +-
src/nbdev.cpp | 2 +-
src/utils.h | 42 +-
tests/edgeR-Tests.Rout.save | 1079 ++++++++++++++++++++--------------------
86 files changed, 1503 insertions(+), 3215 deletions(-)
diff --git a/DESCRIPTION b/DESCRIPTION
index 4e08836..245d4e8 100755
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,14 +1,14 @@
Package: edgeR
-Version: 3.18.1
-Date: 2017-04-17
+Version: 3.14.0
+Date: 2016-04-19
Title: Empirical Analysis of Digital Gene Expression Data in R
Description: Differential expression analysis of RNA-seq expression profiles with biological replication. Implements a range of statistical methodology based on the negative binomial distributions, including empirical Bayes estimation, exact tests, generalized linear models and quasi-likelihood tests. As well as RNA-seq, it can be applied to differential signal analysis of other types of genomic data that produce counts, including ChIP-seq, SAGE and CAGE.
Author: Yunshun Chen <yuchen at wehi.edu.au>, Aaron Lun <alun at wehi.edu.au>, Davis McCarthy <dmccarthy at wehi.edu.au>, Xiaobei Zhou <xiaobei.zhou at uzh.ch>, Mark Robinson <mark.robinson at imls.uzh.ch>, Gordon Smyth <smyth at wehi.edu.au>
Maintainer: Yunshun Chen <yuchen at wehi.edu.au>, Aaron Lun <alun at wehi.edu.au>, Mark Robinson <mark.robinson at imls.uzh.ch>, Davis McCarthy <dmccarthy at wehi.edu.au>, Gordon Smyth <smyth at wehi.edu.au>
License: GPL (>=2)
Depends: R (>= 2.15.0), limma
-Imports: graphics, stats, utils, methods, locfit
-Suggests: MASS, statmod, splines, KernSmooth
+Imports: graphics, stats, utils, methods
+Suggests: MASS, statmod, splines, locfit, KernSmooth
URL: http://bioinf.wehi.edu.au/edgeR
biocViews: GeneExpression, Transcription, AlternativeSplicing,
Coverage, DifferentialExpression, DifferentialSplicing,
@@ -16,4 +16,4 @@ biocViews: GeneExpression, Transcription, AlternativeSplicing,
TimeCourse, SAGE, Sequencing, ChIPSeq, RNASeq, BatchEffect,
MultipleComparison, Normalization, QualityControl
NeedsCompilation: yes
-Packaged: 2017-05-05 22:32:41 UTC; biocbuild
+Packaged: 2016-05-04 03:09:43 UTC; biocbuild
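The DESCRIPTION hunk above moves locfit from Imports to Suggests, so version 3.14.0 loads locfit only when a locfit-based fit is actually requested. A minimal sketch of that deferred-dependency pattern, with a hypothetical wrapper name (fitWithLocfit is not part of edgeR):

    # Sketch: guard a Suggests-only dependency behind requireNamespace().
    fitWithLocfit <- function(y, x, span = 0.5) {
      if (!requireNamespace("locfit", quietly = TRUE))
        stop("locfit required but is not available")
      fitted(locfit::locfit(y ~ x, alpha = span))
    }

The same guard appears verbatim in the R/locfitByCol.R hunk further down.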
diff --git a/NAMESPACE b/NAMESPACE
index 91086b0..32babb5 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -8,7 +8,6 @@ exportClasses("DGEList","DGEExact","DGEGLM","DGELRT","TopTags")
exportMethods("show")
import(methods)
-import(locfit)
importFrom("graphics", "abline", "axis", "curve", "grid", "legend",
"lines", "mtext", "plot", "points", "smoothScatter")
importFrom("stats", "approxfun", "as.dist", "chisq.test", "cmdscale",
@@ -23,8 +22,8 @@ if( tools:::.OStype() == "windows" ) importFrom("utils", "winMenuAddItem")
importFrom("limma",camera,contrastAsCoef,decideTests,fry,goana,kegga,is.fullrank,
lmFit,loessFit,mroast,nonEstimable,plotMD,plotMDS,plotWithHighlights,removeExt,
- roast,romer,squeezeVar,subsetListOfArrays,zscoreGamma,zscoreT)
-importClassesFrom("limma","LargeDataObject","Roast","MDS","TestResults")
+ roast,romer,squeezeVar,subsetListOfArrays,zscoreGamma)
+importClassesFrom("limma","LargeDataObject","Roast","MDS")
S3method(as.data.frame,TopTags)
S3method(as.matrix,DGEList)
@@ -41,8 +40,6 @@ S3method(dim,DGEExact)
S3method(dim,DGEGLM)
S3method(dim,DGELRT)
S3method(dim,TopTags)
-S3method(decideTests,DGEExact)
-S3method(decideTests,DGELRT)
S3method(dimnames,DGEList)
S3method(dimnames,DGEExact)
S3method(dimnames,DGEGLM)
@@ -76,6 +73,11 @@ S3method(goana,DGEExact)
S3method(goana,DGELRT)
S3method(kegga,DGEExact)
S3method(kegga,DGELRT)
+S3method(length,DGEList)
+S3method(length,DGEExact)
+S3method(length,TopTags)
+S3method(length,DGEGLM)
+S3method(length,DGELRT)
S3method(mroast,DGEList)
S3method(plotMD,DGEGLM)
S3method(plotMD,DGELRT)
diff --git a/R/DGEList.R b/R/DGEList.R
index 9bdcddb..4f1717a 100644
--- a/R/DGEList.R
+++ b/R/DGEList.R
@@ -1,6 +1,6 @@
DGEList <- function(counts=matrix(0,0,0), lib.size=colSums(counts), norm.factors=rep(1,ncol(counts)), samples=NULL, group=NULL, genes=NULL, remove.zeros=FALSE)
# Construct DGEList object from components, with some checking
-# Last modified 3 Oct 2016
+# Last modified 3 Dec 2015
{
# Check counts
counts <- as.matrix(counts)
@@ -8,12 +8,11 @@ DGEList <- function(counts=matrix(0,0,0), lib.size=colSums(counts), norm.factors
ntags <- nrow(counts)
if(nlib>0L && is.null(colnames(counts))) colnames(counts) <- paste0("Sample",1L:nlib)
if(ntags>0L && is.null(rownames(counts))) rownames(counts) <- 1L:ntags
- .isAllZero(counts) # don't really care about all-zeroes, but do want to protect against NA's, negative values.
# Check lib.size
if(is.null(lib.size)) lib.size <- colSums(counts)
- if(nlib != length(lib.size)) stop("length of 'lib.size' must equal number of columns in 'counts'")
- if(any(lib.size==0L)) warning("library size of zero detected")
+ if(nlib != length(lib.size)) stop("Length of 'lib.size' must equal number of columns in 'counts'")
+ if(any(lib.size==0L)) warning("Zero library size detected.")
# Check norm.factors
if(is.null(norm.factors)) norm.factors <- rep(1,ncol(counts))
@@ -70,11 +69,3 @@ DGEList <- function(counts=matrix(0,0,0), lib.size=colSums(counts), norm.factors
x
}
-.isAllZero <- function(y)
-# Function to check if all counts are zero in a memory-efficient manner.
-# Also checks and throws an error if NA or negative counts are present.
-{
- is.zero <- .Call(.cR_check_counts, y)
- if(is.character(is.zero)) stop(is.zero)
- return(is.zero)
-}
diff --git a/R/S3methods.R b/R/S3methods.R
index 8e2e55b..116cd0c 100644
--- a/R/S3methods.R
+++ b/R/S3methods.R
@@ -19,7 +19,7 @@ dim.DGEExact <- dim.TopTags <- dim.DGELRT <- function(x) if(is.null(x$table)) c(
# S3 length methods
-#length.DGEList <- length.DGEExact <- length.TopTags <- length.DGEGLM <- length.DGELRT <- function(x) prod(dim(x))
+length.DGEList <- length.DGEExact <- length.TopTags <- length.DGEGLM <- length.DGELRT <- function(x) prod(dim(x))
# S3 dimnames methods
# These enable rownames() and colnames() as well
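Together with the NAMESPACE hunk above, this re-enables the shared length() method, so the imported version defines length() on all five classes as prod(dim(x)). A toy illustration on simulated counts:

    # Toy demo of dim() and length() on a DGEList.
    library(edgeR)
    counts <- matrix(rpois(20, lambda = 10), nrow = 5, ncol = 4)
    d <- DGEList(counts = counts)
    dim(d)      # 5 4
    length(d)   # prod(dim(d)) = 20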
diff --git a/R/addPriorCount.R b/R/addPriorCount.R
deleted file mode 100644
index 4f45262..0000000
--- a/R/addPriorCount.R
+++ /dev/null
@@ -1,29 +0,0 @@
-addPriorCount <- function(y, lib.size=NULL, offset=NULL, prior.count=1)
-# Add library size-adjusted prior counts to values of 'y'.
-# Also add twice the adjusted prior to the library sizes,
-# which are provided as log-transformed values in 'offset'.
-#
-# written by Aaron Lun
-# created 26 September 2016
-{
-# Check y
- if (!is.numeric(y)) stop('count matrix must be numeric')
- y <- as.matrix(y)
-
-# Check prior.count
- prior.count <- .compressPrior(prior.count)
-
-# Check lib.size and offset.
-# If offsets are provided, they must have a similar average to log(lib.size)
-# for the results to be meaningful as logCPM values
- offset <- .compressOffsets(y, lib.size=lib.size, offset=offset)
-
-# Adding the prior count.
- out <- .Call(.cR_add_prior_count, y, offset, prior.count)
- if (is.character(out)) stop(out)
-
- names(out) <- c("y", "offset")
- out$offset <- makeCompressedMatrix(out$offset, byrow=TRUE)
- return(out)
-}
-
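The deleted addPriorCount() belongs to the newer 3.18.1 sources; its arithmetic is the library-size-adjusted prior used throughout edgeR. A plain-R sketch of the same computation on toy counts, without the C++ and compressedMatrix machinery:

    # Sketch of the prior-count arithmetic: scale the prior by relative
    # library size (preserving fold changes), add it to the counts, and
    # add twice the scaled prior to the library sizes.
    y <- matrix(c(0, 5, 10, 0, 50, 100), nrow = 3)   # toy counts
    lib.size <- colSums(y)
    prior.count <- 1
    prior.scaled <- lib.size / mean(lib.size) * prior.count
    y.adj  <- t(t(y) + prior.scaled)
    offset <- log(lib.size + 2 * prior.scaled)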
diff --git a/R/adjustedProfileLik.R b/R/adjustedProfileLik.R
index 9344aca..bc7266b 100644
--- a/R/adjustedProfileLik.R
+++ b/R/adjustedProfileLik.R
@@ -6,41 +6,55 @@ adjustedProfileLik <- function(dispersion, y, design, offset, weights=NULL, adju
# Yunshun Chen, Gordon Smyth, Aaron Lun
# Created June 2010. Last modified 27 July 2015.
{
-# Checking counts
- if (!is.numeric(y)) stop("counts must be numeric")
- y <- as.matrix(y)
-
-# Checking offsets
- if (!is.double(offset)) storage.mode(offset) <- "double"
- offset <- makeCompressedMatrix(offset, byrow=TRUE)
-
-# Checking dispersion
- if (!is.double(dispersion)) storage.mode(dispersion) <- "double"
- dispersion <- makeCompressedMatrix(dispersion, byrow=FALSE)
-
-# Checking weights
- if(is.null(weights)) weights <- 1
- if (!is.double(weights)) storage.mode(weights) <- "double"
- weights <- makeCompressedMatrix(weights, byrow=TRUE)
-
+ if(any(dim(y)!=dim(offset))) offset <- expandAsMatrix(offset,dim(y))
+ ntags <- nrow(y)
+ nlibs <- ncol(y)
+ dispersion <- expandAsMatrix(dispersion,dim(y),byrow=FALSE)
+
# Fit tagwise linear models. This is actually the most time-consuming
# operation that I can see for this function.
fit <- glmFit(y,design=design,dispersion=dispersion,offset=offset,prior.count=0,weights=weights,start=start)
- mu <- fit$fitted
-# Check other inputs to C++ code
- adjust <- as.logical(adjust)
- if (!is.double(design)) storage.mode(design)<-"double"
+# Compute log-likelihood.
+ mu <- fit$fitted
+ if(is.null(weights)) weights <- 1
+ if(all(dispersion == 0)){
+# loglik <- rowSums(weights*dpois(y,lambda=mu,log = TRUE))
+ ll <- y*log(mu)-mu-lgamma(y+1)
+ ll[mu==0] <- 0
+ loglik <- rowSums(weights*ll)
+
+ } else {
+# loglik <- rowSums(weights*dnbinom(y,size=1/dispersion,mu=mu,log = TRUE))
+ r <- 1/dispersion
+ ll <- y*log(mu)-y*log(mu+r)+r*log(r)-r*log(mu+r)+lgamma(y+r)-lgamma(y+1)-lgamma(r)
+ ll[mu==0] <- 0
+ loglik <- rowSums(weights*ll)
+ }
+ if (!adjust) {
+ return(loglik)
+ }
+
+# Calculating the Cox-Reid adjustment.
+ if(ncol(design)==1) {
+ D <- rowSums(weights*mu/(1+mu*dispersion))
+ cr <- 0.5*log(abs(D))
+ } else {
+ W <- weights*mu/(1+dispersion*mu)
-# Compute adjusted log-likelihood
- apl <- .Call(.cR_compute_apl, y, mu, dispersion, weights, adjust, design)
- if (is.character(apl)) stop(apl)
+# Checking type, otherwise the C++ code will complain.
+# Note the use of a transposed matrix for easy row access in column-major format.
+ if (!is.double(W)) storage.mode(W)<-"double"
+ if (!is.double(design)) storage.mode(design)<-"double"
+ cr <- .Call(.cR_cr_adjust, t(W), design, nrow(design))
+ if (is.character(cr)) { stop(cr) }
+ }
# Deciding what to return.
- if (get.coef) {
- return(list(apl=apl, beta=fit$coefficients))
+ if (get.coef) {
+ return(list(apl=loglik-cr, beta=fit$coefficients))
} else {
- return(apl)
+ return(loglik - cr)
}
}
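The replacement code computes the NB log-likelihood directly in R and then subtracts the Cox-Reid adjustment, which is half the log-determinant of the Fisher information X'WX with working weights W = mu/(1 + phi*mu). A self-contained one-gene sketch with toy values:

    # Sketch: Cox-Reid adjusted profile likelihood for a single gene.
    y   <- c(4, 7, 1, 12)                 # toy counts for one gene
    mu  <- c(5, 6, 2, 11)                 # fitted means
    phi <- 0.1                            # dispersion
    X   <- cbind(1, c(0, 0, 1, 1))        # design matrix
    ll  <- sum(dnbinom(y, size = 1/phi, mu = mu, log = TRUE))
    W   <- diag(mu / (1 + phi * mu))      # working weights
    cr  <- 0.5 * as.numeric(determinant(t(X) %*% W %*% X)$modulus)
    apl <- ll - cr                        # adjusted profile log-likelihood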
diff --git a/R/aveLogCPM.R b/R/aveLogCPM.R
index daf00b5..0b3c601 100644
--- a/R/aveLogCPM.R
+++ b/R/aveLogCPM.R
@@ -39,40 +39,54 @@ aveLogCPM.default <- function(y,lib.size=NULL,offset=NULL,prior.count=2,dispersi
# Gordon Smyth
# Created 25 Aug 2012. Last modified 25 Sep 2014.
{
-# Special case when all counts and library sizes are zero
+# Check y
y <- as.matrix(y)
- if(.isAllZero(y)) {
- if ((is.null(lib.size) || max(lib.size)==0) && (is.null(offset) || max(offset)==-Inf)) {
- abundance <- rep(-log(nrow(y)),nrow(y))
- return( (abundance+log(1e6)) / log(2) )
- }
- }
+ if(any(is.na(y))) stop("NA counts not allowed")
+ if(any(y<0)) stop("counts must be non-negative")
+
+# Check prior.count
+ neg.prior <- prior.count < 0
+ if(any(neg.prior)) prior.count[neg.prior] <- 0
# Check dispersion
if(is.null(dispersion)) dispersion <- 0.05
- isna <- is.na(dispersion) # ???
+ isna <- is.na(dispersion)
if(all(isna)) dispersion <- 0.05
if(any(isna)) dispersion[isna] <- mean(dispersion,na.rm=TRUE)
- dispersion <- .compressDispersions(dispersion)
+# Check lib.size and offset.
+# If offset is provided, it takes precedence over lib.size.
+# However it must have a similar average to log(lib.size)
+# for the results to be meaningful as logCPM values
+ if(is.null(offset)) {
+ if(is.null(lib.size)) lib.size <- colSums(y)
+ } else {
+ lib.size <- exp(offset)
+ }
+ mean.lib.size <- mean(lib.size)
-# Check weights
- weights <- .compressWeights(weights)
+# Special case when all counts are zero
+ if(mean.lib.size==0) {
+ abundance <- rep(-log(nrow(y)),nrow(y))
+ return( (abundance+log(1e6)) / log(2) )
+ }
-# Check offsets
- offset <- .compressOffsets(y, lib.size=lib.size, offset=offset)
+# Ensuring lib.size has appropriate dimensions for prior.count
+ if(length(prior.count)>1L) {
+ if(nrow(y)!=length(prior.count)) stop("length of prior count vector should be equal to the number of rows")
+ lib.size <- expandAsMatrix(lib.size, dim(y))
+ }
-# Check prior counts
- prior.count <- .compressPrior(prior.count)
+# Scale prior counts to preserve fold changes
+ prior.count.scaled <- lib.size/mean.lib.size*prior.count
-# Retrieve GLM fitting parameters
- maxit <- formals(mglmOneGroup)$maxit
- tol <- formals(mglmOneGroup)$tol
+# Add double prior counts to library sizes
+ offset <- log(lib.size+2*prior.count.scaled)
-# Calling the C++ code
- ab <- .Call(.cR_ave_log_cpm, y, offset, prior.count, dispersion, weights, maxit, tol)
- if (is.character(ab)) stop(ab)
+# Add prior counts to y
+ if (is.null(dim(prior.count.scaled))) prior.count.scaled <- expandAsMatrix(prior.count.scaled, dim(y))
+ y <- y+prior.count.scaled
- return(ab)
+ abundance <- mglmOneGroup(y,dispersion=dispersion,offset=offset,weights=weights)
+ (abundance+log(1e6)) / log(2)
}
-
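The abundance returned by mglmOneGroup() is on the natural-log proportion scale; the final line rescales it to log2 counts-per-million. The conversion in isolation, with an assumed abundance:

    # A gene taking 0.1% of the library corresponds to 1000 CPM.
    abundance <- log(0.001)
    (abundance + log(1e6)) / log(2)   # log2(1000), about 9.97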
diff --git a/R/cpm.R b/R/cpm.R
index 5f6d041..6a8f7d3 100644
--- a/R/cpm.R
+++ b/R/cpm.R
@@ -14,29 +14,20 @@ cpm.DGEList <- function(x, normalized.lib.sizes=TRUE, log=FALSE, prior.count=0.2
cpm.default <- function(x, lib.size=NULL, log=FALSE, prior.count=0.25, ...)
# Counts per million for a matrix
# Davis McCarthy and Gordon Smyth.
-# Created 20 June 2011. Last modified 03 October 2016
+# Created 20 June 2011. Last modified 11 March 2016
{
x <- as.matrix(x)
if (any(dim(x)==0L)) {
return(x)
}
-
if(is.null(lib.size)) lib.size <- colSums(x)
- if(!is.double(lib.size)) storage.mode(lib.size) <- "double"
- lib.size <- makeCompressedMatrix(lib.size, byrow=TRUE)
- err <- .Call(.cR_check_nonnegative, lib.size, "library sizes")
- if (is.character(err)) stop(err)
-
- # Calculating in C++ for max efficiency
if(log) {
- prior.count <- .compressPrior(prior.count)
- out <- .Call(.cR_calculate_cpm_log, x, lib.size, prior.count)
- } else {
- out <- .Call(.cR_calculate_cpm_raw, x, lib.size)
+ prior.count.scaled <- lib.size/mean(lib.size)*prior.count
+ lib.size <- lib.size+2*prior.count.scaled
}
- if (is.character(out)) stop(out)
-
- # Cleaning up
- dimnames(out) <- dimnames(x)
- out
+ lib.size <- 1e-6*lib.size
+ if(log)
+ log2(t( (t(x)+prior.count.scaled) / lib.size ))
+ else
+ t(t(x)/lib.size)
}
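The restored pure-R cpm() scales the prior count by relative library size, inflates the library sizes by twice the scaled prior, and takes log2. Both branches on toy counts:

    # Demo of the cpm() arithmetic in this hunk.
    x <- matrix(c(0, 10, 100, 0, 20, 200), nrow = 3)   # toy counts
    lib.size <- colSums(x)
    t(t(x) / (1e-6 * lib.size))                        # raw CPM
    prior.count  <- 0.25
    prior.scaled <- lib.size / mean(lib.size) * prior.count
    log2(t((t(x) + prior.scaled) /
           (1e-6 * (lib.size + 2 * prior.scaled))))    # log2 CPM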
diff --git a/R/decidetestsDGE.R b/R/decidetestsDGE.R
index a3a968c..88bd1ab 100644
--- a/R/decidetestsDGE.R
+++ b/R/decidetestsDGE.R
@@ -1,55 +1,14 @@
-# decideTestsDGE.R
-
-decideTests.DGEExact <- decideTests.DGELRT <- function(object,adjust.method="BH",p.value=0.05,lfc=0,...)
-{
- decideTestsDGE(object=object,adjust.method=adjust.method,p.value=p.value,lfc=lfc)
-}
+# DECIDETESTSDGE.R
decideTestsDGE <- function(object,adjust.method="BH",p.value=0.05,lfc=0)
-# Accept or reject hypothesis tests across genes and contrasts
-# edgeR team. Original author was Davis McCarthy.
-# Created 15 August 2010. Last modified 6 Jan 2017.
+ ## Accept or reject hypothesis tests across genes and contrasts
+ ## Davis McCarthy
+ ## 15 August 2010. Last modified 17 Oct 2014 (Mark Robinson).
{
-# Check object class
- if( !(is(object,"DGEExact") || is(object,"DGELRT")) ) stop("Need DGEExact or DGELRT object")
-
-# Apply multiple testing
- p <- object$table$PValue
- p <- p.adjust(p, method=adjust.method)
- isDE <- as.integer(p < p.value)
-
-# Extract logFC
- logFC <- object$table$logFC
-
-# Check for F-test with multiple logFC columns
- FTest <- is.null(logFC)
-
-# With multiple contrasts, apply lfc threshold to maximum logFC
- if(FTest) {
- if(lfc>0) {
- coef.col <- grep("^logFC",colnames(object$table))
- logFC <- object$table[,coef.col]
- SmallFC <- rowSums(abs(logFC) >= lfc) == 0
- isDE[SmallFC] <- 0L
- }
-
-# With single contrast, apply directionality and lfc threshold
- } else {
- isDE[isDE & logFC<0] <- -1L
- SmallFC <- (abs(logFC) < lfc)
- isDE[SmallFC] <- 0L
- }
+ if(!is(object,"DGEExact") & !is(object,"DGELRT")) stop("Need DGEExact or DGELRT object") # Expects a DGEExact or DGELRT object
+ decideTests(new("MArrayLM", list(p.value=object$table$PValue, coefficients=object$table$logFC)),
+ method="separate", adjust.method=adjust.method, p.value=p.value, lfc=lfc)
+}
-# Assemble TestResults object
- isDE <- matrix(isDE, ncol=1)
- row.names(isDE) <- row.names(object)
- colnames(isDE) <- paste(object$comparison,collapse="+")
-# Record possible values
- if(FTest)
- attr(isDE,"levels") <- c(0L,1L)
- else
- attr(isDE,"levels") <- c(-1L,0L,1L)
- new("TestResults", isDE)
-}
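Both versions implement the same decision rule; 3.14.0 just delegates to limma's decideTests() via a minimal MArrayLM shell. For a single contrast the rule is: adjust the p-values, flag genes passing the threshold, sign them by logFC, and zero out small fold changes. A sketch with toy values:

    # Sketch of the single-contrast decision rule.
    PValue <- c(0.001, 0.2, 0.0005, 0.04)
    logFC  <- c(2.1, -0.3, -1.8, 0.2)
    p <- p.adjust(PValue, method = "BH")
    isDE <- as.integer(p < 0.05)          # 1 = significant
    isDE[isDE == 1L & logFC < 0] <- -1L   # sign by direction
    isDE[abs(logFC) < 1] <- 0L            # lfc threshold of 1
    isDE                                  # 1, 0, -1 per gene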
diff --git a/R/diffSpliceDGE.R b/R/diffSpliceDGE.R
index 8455bf5..630da6d 100644
--- a/R/diffSpliceDGE.R
+++ b/R/diffSpliceDGE.R
@@ -2,7 +2,7 @@ diffSpliceDGE <- function(glmfit, coef=ncol(glmfit$design), contrast=NULL, genei
{
# Identify exons and genes with splice variants using negative binomial GLMs
# Yunshun Chen and Gordon Smyth
-# Created 29 March 2014. Last modified 03 October 2016.
+# Created 29 March 2014. Last modified 25 September 2015.
# Check if glmfit is from glmFit() or glmQLFit()
isLRT <- is.null(glmfit$df.prior)
@@ -106,8 +106,7 @@ diffSpliceDGE <- function(glmfit, coef=ncol(glmfit$design), contrast=NULL, genei
gene.betabar <- fit.gene$coefficients[g, coef, drop=FALSE]
# New offset
- offset.new <- .addCompressedMatrices(makeCompressedMatrix(glmfit$offset),
- makeCompressedMatrix(gene.betabar %*% t(design[,coef,drop=FALSE])))
+ offset.new <- glmfit$offset + gene.betabar %*% t(design[, coef, drop=FALSE])
coefficients <- beta - gene.betabar
# Testing
diff --git a/R/dispBinTrend.R b/R/dispBinTrend.R
index 1382a55..d0fb75a 100644
--- a/R/dispBinTrend.R
+++ b/R/dispBinTrend.R
@@ -30,7 +30,7 @@ dispBinTrend <- function(y, design=NULL, offset=NULL, df=5, span=0.3, min.n=400,
method.trend <- match.arg(method.trend, c("spline", "loess"))
# Check AveLogCPM
- if(is.null(AveLogCPM)) AveLogCPM <- aveLogCPM(y, offset=offset, weights=weights)
+ if(is.null(AveLogCPM)) AveLogCPM <- aveLogCPM(y,weights=weights)
# Define bins of genes based on min.n in each bin
# All zero rows are marked as group==0
@@ -57,9 +57,8 @@ dispBinTrend <- function(y, design=NULL, offset=NULL, df=5, span=0.3, min.n=400,
bin.d <- bin.A <- rep(0,nbins)
for(i in 1:nbins) {
bin <- group==i
- binAve <- AveLogCPM[bin]
- bin.d[i] <- estimateGLMCommonDisp(y[bin,], design, method=method.bin, offset[bin,], min.row.sum=0, weights=weights[bin,], AveLogCPM=binAve, ...)
- bin.A[i] <- mean(binAve)
+ bin.d[i] <- estimateGLMCommonDisp(y[bin,], design, method=method.bin, offset[bin,], min.row.sum=0, weights=weights[bin,] ,...)
+ bin.A[i] <- mean(AveLogCPM[bin])
}
# If just one bin, trended dispersion is constant
diff --git a/R/estimateDisp.R b/R/estimateDisp.R
index 7b94170..149018d 100644
--- a/R/estimateDisp.R
+++ b/R/estimateDisp.R
@@ -12,7 +12,6 @@ estimateDisp.DGEList <- function(y, design=NULL, prior.df=NULL, trend.method="lo
d <- estimateDisp(y=y$counts, design=design, group=group, lib.size=lib.size, offset=getOffset(y), prior.df=prior.df, trend.method=trend.method, mixed.df=mixed.df, tagwise=tagwise, span=span, min.row.sum=min.row.sum, grid.length=grid.length, grid.range=grid.range, robust=robust, winsor.tail.p=winsor.tail.p, tol=tol, weights=y$weights, ...)
- if(!is.null(design)) y$design <- design
y$common.dispersion <- d$common.dispersion
y$trended.dispersion <- d$trended.dispersion
if(tagwise) y$tagwise.dispersion <- d$tagwise.dispersion
@@ -27,7 +26,7 @@ estimateDisp.DGEList <- function(y, design=NULL, prior.df=NULL, trend.method="lo
estimateDisp.default <- function(y, design=NULL, group=NULL, lib.size=NULL, offset=NULL, prior.df=NULL, trend.method="locfit", mixed.df=FALSE, tagwise=TRUE, span=NULL, min.row.sum=5, grid.length=21, grid.range=c(-10,10), robust=FALSE, winsor.tail.p=c(0.05,0.1), tol=1e-06, weights=NULL, ...)
# Use GLM approach if a design matrix is given, and classic approach otherwise.
# It calculates a matrix of likelihoods for each gene at a set of dispersion grid points, and then calls WLEB() to do the shrinkage.
-# Yunshun Chen, Aaron Lun, Gordon Smyth. Created July 2012. Last modified 03 Oct 2016.
+# Yunshun Chen, Gordon Smyth. Created July 2012. Last modified 18 March 2016.
{
# Check y
y <- as.matrix(y)
@@ -48,16 +47,11 @@ estimateDisp.default <- function(y, design=NULL, group=NULL, lib.size=NULL, offs
if(length(lib.size)!=nlibs) stop("Incorrect length of lib.size.")
# Check offset
- offset <- .compressOffsets(y, lib.size=lib.size, offset=offset)
-
-# Check weights
- weights <- .compressWeights(weights)
+ if(is.null(offset)) offset <- log(lib.size)
+ offset <- expandAsMatrix(offset, dim(y))
# Check for genes with small counts
sel <- rowSums(y) >= min.row.sum
- sely <- .subsetMatrixWithoutCopying(y, i=sel)
- seloffset <- .subsetMatrixWithoutCopying(offset, i=sel)
- selweights <- .subsetMatrixWithoutCopying(weights, i=sel)
# Spline points
spline.pts <- seq(from=grid.range[1], to=grid.range[2], length=grid.length)
@@ -100,16 +94,14 @@ estimateDisp.default <- function(y, design=NULL, group=NULL, lib.size=NULL, offs
return(list(common.dispersion=NA, trended.dispersion=NA, tagwise.dispersion=NA))
}
- # Identify which observations have means of zero (weights aren't needed here).
- glmfit <- glmFit(sely, design, offset=seloffset, dispersion=0.05, prior.count=0)
- zerofit <- .areFittedValuesZero(y=glmfit$counts, mu=glmfit$fitted.values)
+ # Protect against zeros.
+ glmfit <- glmFit(y[sel,], design, offset=offset[sel,], dispersion=0.05, prior.count=0)
+ zerofit <- (glmfit$fitted.values < 1e-4) & (glmfit$counts < 1e-4)
by.group <- .comboGroups(zerofit)
for (subg in by.group) {
cur.nzero <- !zerofit[subg[1],]
if (!any(cur.nzero)) { next }
-
- # Removing samples with zero means from design, to avoid attempts to converge to -Inf.
if (all(cur.nzero)) {
redesign <- design
} else {
@@ -119,15 +111,11 @@ estimateDisp.default <- function(y, design=NULL, group=NULL, lib.size=NULL, offs
if (nrow(redesign) == ncol(redesign)) { next }
}
- cury <- .subsetMatrixWithoutCopying(sely, i=subg, j=cur.nzero)
- curo <- .subsetMatrixWithoutCopying(seloffset, i=subg, j=cur.nzero)
- curw <- .subsetMatrixWithoutCopying(selweights, i=subg, j=cur.nzero)
-
# Using the last fit to hot-start the next fit
last.beta <- NULL
- for(i in seq_len(grid.length)) {
- out <- adjustedProfileLik(spline.disp[i], y=cury, design=redesign,
- offset=curo, weights=curw, start=last.beta, get.coef=TRUE)
+ for(i in 1:grid.length) {
+ out <- adjustedProfileLik(spline.disp[i], y=y[sel, ][subg,cur.nzero,drop=FALSE], design=redesign,
+ offset=offset[sel,][subg,cur.nzero,drop=FALSE], start=last.beta, get.coef=TRUE)
l0[subg,i] <- out$apl
last.beta <- out$beta
}
@@ -148,13 +136,13 @@ estimateDisp.default <- function(y, design=NULL, group=NULL, lib.size=NULL, offs
# Calculate prior.df
if(is.null(prior.df)){
- glmfit <- glmFit(sely, offset=seloffset, weight=selweights, design=design, dispersion=disp.trend, prior.count=0)
+ glmfit <- glmFit(y[sel,], design, offset=offset[sel,], dispersion=disp.trend, prior.count=0)
# Residual deviances
df.residual <- glmfit$df.residual
# Adjust df.residual for fitted values at zero
- zerofit <- .areFittedValuesZero(y=glmfit$counts, mu=glmfit$fitted.values)
+ zerofit <- (glmfit$fitted.values < 1e-4) & (glmfit$counts < 1e-4)
df.residual <- .residDF(zerofit, design)
# Empirical Bayes squeezing of the quasi-likelihood variance factors
@@ -279,51 +267,3 @@ WLEB <- function(theta, loglik, prior.n=5, covariate=NULL, trend.method="locfit"
out
}
-
-.subsetMatrixWithoutCopying <- function(x, i, j)
-# This will attempt to subset the matrix without any copying if
-# it detects that 'i' and 'j' don't modify the ordering of the matrix.
-# This reduces the memory footprint for large matrices.
-#
-# written by Aaron Lun
-# created 29 September 2016
-{
- if (is(x, "compressedMatrix")) {
- # Subsetting is ignored if rows/columns are repeated.
- if (attributes(x)$repeat.row) i <- TRUE
- if (attributes(x)$repeat.col) j <- TRUE
- } else if (!is.matrix(x)) {
- stop("'x' must be a matrix for subsetting")
- }
-
- isokay <- TRUE
- if (!missing(i)) {
- # Most flexible way of handling different types of subset vectors;
- # try it out and see if it gives the same results.
- example <- cbind(seq_len(nrow(x)))
- rownames(example) <- rownames(x)
- if (!identical(example, example[i,,drop=FALSE])) isokay <- FALSE
- }
- if (!missing(j)) {
- example <- rbind(seq_len(ncol(x)))
- colnames(example) <- colnames(x)
- if (!identical(example, example[,j,drop=FALSE])) isokay <- FALSE
- }
-
- if (isokay) {
- # Avoids copying if no modification incurred.
- return(x)
- } else {
- return(x[i,j,drop=FALSE])
- }
-}
-
-.areFittedValuesZero <- function(y, mu, tol=1e-4)
-# A slightly more efficient way to check if the fitted values are zero,
-# based on both the count and fitted value being zero for an observation.
-{
- if (!is.double(mu)) storage.mode(mu) <- "double"
- out <- .Call(.cR_check_zero_fitted, y, mu, as.double(tol))
- if (is.character(out)) stop(out)
- return(out)
-}
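The deleted .subsetMatrixWithoutCopying() avoids a copy when the requested subset is a no-op. It tests this cheaply by subsetting a one-column stand-in and comparing with identical(). The trick in isolation:

    # Sketch of the "is this subset a no-op?" test.
    x <- matrix(1:12, nrow = 4)
    i <- 1:4                                # candidate row subset
    stand.in <- cbind(seq_len(nrow(x)))     # cheap single-column proxy
    if (identical(stand.in, stand.in[i, , drop = FALSE])) {
      x                                     # ordering unchanged: no copy
    } else {
      x[i, , drop = FALSE]                  # otherwise subset for real
    }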
diff --git a/R/estimateGLMCommonDisp.R b/R/estimateGLMCommonDisp.R
index 68619b4..6b60a9e 100644
--- a/R/estimateGLMCommonDisp.R
+++ b/R/estimateGLMCommonDisp.R
@@ -40,7 +40,7 @@ estimateGLMCommonDisp.default <- function(y, design=NULL, offset=NULL, method="C
if(is.null(offset)) offset <- log(colSums(y))
# Check AveLogCPM
- if(is.null(AveLogCPM)) AveLogCPM <- aveLogCPM(y, offset=offset, weights=weights)
+ if(is.null(AveLogCPM)) AveLogCPM <- aveLogCPM(y,weights=weights)
# Call lower-level function
disp <- switch(method,
diff --git a/R/expandAsMatrix.R b/R/expandAsMatrix.R
index f96c3cd..c71064b 100644
--- a/R/expandAsMatrix.R
+++ b/R/expandAsMatrix.R
@@ -1,7 +1,8 @@
+
expandAsMatrix <- function(x,dim=NULL,byrow=TRUE)
# Convert scalar, row or column vector, or matrix, to be a matrix
-# Gordon Smyth, Yunshun Chen, Aaron Lun
-# 26 Jan 2011. Last modified 03 Oct 2016.
+# Gordon Smyth
+# 26 Jan 2011. Last modified 27 July 2015 (Yunshun Chen).
{
# Check dim argument
if(is.null(dim)) return(as.matrix(x))
@@ -24,19 +25,6 @@ expandAsMatrix <- function(x,dim=NULL,byrow=TRUE)
if(length(dx)<2) stop("x has less than 2 dimensions")
if(length(dx)>2) stop("x has more than 2 dimensions")
if(all(dx==dim)) return(as.matrix(x))
-
-# x is a compressedMatrix
- if(is(x, "compressedMatrix")) {
- if (attributes(x)$repeat.row && dim[2]==ncol(x)) {
- if (!byrow) warning("'byrow=FALSE' is not compatible with compressedMatrix settings")
- return(Recall(as.vector(x),dim=dim,byrow=TRUE))
- } else if (attributes(x)$repeat.col && dim[1]==nrow(x)) {
- if (byrow) warning("'byrow=TRUE' is not compatible with compressedMatrix settings")
- return(Recall(as.vector(x),dim=dim,byrow=FALSE))
- } else if (all(dx==1)) {
- return(Recall(as.vector(x),dim=dim))
- }
- }
stop("x is matrix of wrong size")
}
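expandAsMatrix() recycles a scalar or vector to full matrix dimensions; byrow indicates whether a vector holds per-library values (one per column, repeated across rows) or per-gene values. Usage:

    # expandAsMatrix() on the three accepted input shapes.
    library(edgeR)
    expandAsMatrix(2, dim = c(2, 3))                        # scalar
    expandAsMatrix(c(10, 20, 30), dim = c(2, 3))            # per-library row
    expandAsMatrix(c(1, 2), dim = c(2, 3), byrow = FALSE)   # per-gene column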
diff --git a/R/geneset-DGEList.R b/R/geneset-DGEList.R
index 7f2f686..0d74f33 100644
--- a/R/geneset-DGEList.R
+++ b/R/geneset-DGEList.R
@@ -1,37 +1,37 @@
-roast.DGEList <- function(y, index=NULL, design=NULL, contrast=ncol(design), geneid=NULL, set.statistic="mean", gene.weights=NULL,...)
+roast.DGEList <- function(y, index=NULL, design=NULL, contrast=ncol(design), ...)
# Rotation gene set testing for RNA-Seq data
# Yunshun Chen, Gordon Smyth
-# Created 19 Dec 2012. Last revised on 26 Nov 2016
+# Created 19 Dec 2012. Last revised on 27 May 2015
{
y <- .zscoreDGE(y=y, design=design, contrast=contrast)
- roast(y=y, index=index, design=design, contrast=contrast, geneid=geneid, set.statistic=set.statistic, gene.weights=gene.weights, var.prior=1, df.prior=Inf)
-}
+ roast(y=y, index=index, design=design, contrast=contrast, var.prior=1, df.prior=Inf, ...)
+}
-mroast.DGEList <- function(y, index=NULL, design=NULL, contrast=ncol(design), geneid=NULL, set.statistic="mean", gene.weights=NULL, adjust.method="BH", midp=TRUE, sort="directional", ...)
+mroast.DGEList <- function(y, index=NULL, design=NULL, contrast=ncol(design), ...)
# Rotation gene set testing for RNA-Seq data with multiple sets
# Yunshun Chen, Gordon Smyth
-# Created 8 Jan 2013. Last revised 26 Nov 2016.
+# Created 8 Jan 2013. Last revised 27 May 2015.
{
y <- .zscoreDGE(y=y, design=design, contrast=contrast)
- mroast(y=y, index=index, design=design, contrast=contrast, geneid=geneid, set.statistic=set.statistic, gene.weights=gene.weights, var.prior=1, df.prior=Inf, adjust.method=adjust.method, midp=midp, sort=sort)
+ mroast(y=y, index=index, design=design, contrast=contrast, var.prior=1, df.prior=Inf, ...)
}
-fry.DGEList <- function(y, index=NULL, design=NULL, contrast=ncol(design), geneid=NULL, sort="directional", ...)
+fry.DGEList <- function(y, index=NULL, design=NULL, contrast=ncol(design), ...)
# Rotation gene set testing for RNA-Seq data with multiple sets
# Yunshun Chen, Gordon Smyth
-# Created 1 Dec 2015. Last revised 26 Nov 2016.
+# Created 1 Dec 2015. Last revised 15 April 2016.
{
y <- .zscoreDGE(y=y, design=design, contrast=contrast)
- fry(y=y, index=index, design=design, contrast=contrast, standardize="none", geneid=geneid, sort=sort)
+ fry(y=y, index=index, design=design, contrast=contrast, standardize="none", ...)
}
-camera.DGEList <- function(y, index, design=NULL, contrast=ncol(design), weights=NULL, use.ranks=FALSE, allow.neg.cor=FALSE, inter.gene.cor=0.01, sort=TRUE, ...)
+camera.DGEList <- function(y, index, design=NULL, contrast=ncol(design), ...)
# Rotation gene set testing for RNA-Seq data accounting for inter-gene correlation
# Yunshun Chen, Gordon Smyth
-# Created 07 Jan 2013. Last modified 26 Nov 2016.
+# Created 07 Jan 2013. Last modified 27 May 2015.
{
y <- .zscoreDGE(y=y, design=design, contrast=contrast)
- camera(y=y, index=index, design=design, contrast=contrast, weights=weights, use.ranks=use.ranks, allow.neg.cor=allow.neg.cor, inter.gene.cor=inter.gene.cor, trend.var=FALSE, sort=sort)
+ camera(y=y, index=index, design=design, contrast=contrast, ...)
}
romer.DGEList <- function(y, index, design=NULL, contrast=ncol(design), ...)
diff --git a/R/gini.R b/R/gini.R
index 2d7b29a..e376a66 100644
--- a/R/gini.R
+++ b/R/gini.R
@@ -1,15 +1,10 @@
gini <- function(x)
# Gini diversity index for columns of a numeric matrix
# Gordon Smyth
-# Created 5 Feb 2016. Last revised 17 April 2017.
+# 5 Feb 2016
{
- x <- as.matrix(x)
d <- dim(x)
+ if(is.null(d)) d <- dim(x) <- c(length(x),1)
for (j in 1:d[2]) x[,j] <- sort.int(x[,j],na.last=TRUE)
- i <- 1:d[1]
- m <- 0.75*d[1]
- S1 <- colSums((i-m)*x , na.rm=TRUE)
- S2 <- colSums(x, na.rm=TRUE)
- (2*(S1/S2+m)-d[1]-1)/d[1]
+ (2*colSums((1:d[1])*x)/colSums(x)-d[1]-1)/d[1]
}
-
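Both versions compute the Gini index column-wise from sorted values. The 3.14.0 form is the textbook (2*sum(i*x_(i))/sum(x) - n - 1)/n; the 3.18.1 code is algebraically the same but recenters the index at m = 0.75*n to reduce rounding error and handles NAs. A hand check of the textbook form:

    # Gini index of a single sorted column.
    x <- sort(c(1, 1, 2, 6))
    n <- length(x)
    (2 * sum(seq_len(n) * x) / sum(x) - n - 1) / n   # 0.4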
diff --git a/R/glmQLFTest.R b/R/glmQLFTest.R
index 245282b..cb41005 100644
--- a/R/glmQLFTest.R
+++ b/R/glmQLFTest.R
@@ -5,20 +5,18 @@ UseMethod("glmQLFit")
glmQLFit.DGEList <- function(y, design=NULL, dispersion=NULL, offset=NULL, abundance.trend=TRUE, robust=FALSE, winsor.tail.p=c(0.05, 0.1), ...)
# Yunshun Chen and Aaron Lun
-# Created 05 November 2014. Last modified 03 Oct 2016.
+# Created 05 November 2014.
{
- if(is.null(design)) design <- model.matrix(~y$samples$group)
if(is.null(dispersion)) {
dispersion <- y$trended.dispersion
if(is.null(dispersion)) dispersion <- y$common.dispersion
- if(is.null(dispersion)) stop("dispersion value not supplied")
+ if(is.null(dispersion)) dispersion <- 0.05
}
if(is.null(dispersion)) stop("No dispersion values found in DGEList object.")
if(is.null(offset)) offset <- getOffset(y)
if(is.null(y$AveLogCPM)) y$AveLogCPM <- aveLogCPM(y)
- fit <- glmQLFit(y=y$counts, design=design, dispersion=dispersion, offset=offset, lib.size=NULL, abundance.trend=abundance.trend,
- AveLogCPM=y$AveLogCPM, robust=robust, winsor.tail.p=winsor.tail.p, weights=y$weights, ...)
+ fit <- glmQLFit(y=y$counts, design=design, dispersion=dispersion, offset=offset, lib.size=NULL, abundance.trend=abundance.trend, AveLogCPM=y$AveLogCPM, robust=robust, winsor.tail.p=winsor.tail.p, weights=y$weights, ...)
fit$samples <- y$samples
fit$genes <- y$genes
# fit$prior.df <- y$prior.df
@@ -26,17 +24,42 @@ glmQLFit.DGEList <- function(y, design=NULL, dispersion=NULL, offset=NULL, abund
new("DGEGLM",fit)
}
-glmQLFit.default <- function(y, design=NULL, dispersion=NULL, offset=NULL, lib.size=NULL, weights=NULL,
- abundance.trend=TRUE, AveLogCPM=NULL, robust=FALSE, winsor.tail.p=c(0.05, 0.1), ...)
+glmQLFit.default <- function(y, design=NULL, dispersion=NULL, offset=NULL, lib.size=NULL, abundance.trend=TRUE, AveLogCPM=NULL, robust=FALSE, winsor.tail.p=c(0.05, 0.1), ...)
# Fits a GLM and computes quasi-likelihood dispersions for each gene.
# Davis McCarthy, Gordon Smyth, Yunshun Chen, Aaron Lun.
-# Originally part of glmQLFTest, as separate function 15 September 2014. Last modified 03 October 2016.
+# Originally part of glmQLFTest, as separate function 15 September 2014. Last modified 05 November 2014.
{
- glmfit <- glmFit(y, design=design, dispersion=dispersion, offset=offset, lib.size=lib.size, weights=weights,...)
+# Check y
+ y <- as.matrix(y)
+ ntag <- nrow(y)
+ nlib <- ncol(y)
+
+# Check design
+ if(is.null(design)) {
+ design <- matrix(1,ncol(y),1)
+ rownames(design) <- colnames(y)
+ colnames(design) <- "Intercept"
+ } else {
+ design <- as.matrix(design)
+ ne <- nonEstimable(design)
+ if(!is.null(ne)) stop(paste("Design matrix not of full rank. The following coefficients not estimable:\n", paste(ne, collapse = " ")))
+ }
+
+# Check dispersion
+ if(is.null(dispersion)) stop("No dispersion values provided.")
+
+# Check offset and lib.size
+ if(is.null(offset)) {
+ if(is.null(lib.size)) lib.size <- colSums(y)
+ offset <- log(lib.size)
+ }
+ offset <- expandAsMatrix(offset,dim(y))
+
+ glmfit <- glmFit(y,design=design,dispersion=dispersion,offset=offset,lib.size=lib.size,...)
# Setting up the abundances.
if(abundance.trend) {
- if(is.null(AveLogCPM)) AveLogCPM <- aveLogCPM(y, lib.size=lib.size, weights=weights, dispersion=dispersion)
+ if(is.null(AveLogCPM)) AveLogCPM <- aveLogCPM(y)
glmfit$AveLogCPM <- AveLogCPM
} else {
AveLogCPM <- NULL
@@ -44,7 +67,7 @@ glmQLFit.default <- function(y, design=NULL, dispersion=NULL, offset=NULL, lib.s
# Adjust df.residual for fitted values at zero
zerofit <- (glmfit$fitted.values < 1e-4) & (glmfit$counts < 1e-4)
- df.residual <- .residDF(zerofit, glmfit$design)
+ df.residual <- .residDF(zerofit, design)
# Empirical Bayes squeezing of the quasi-likelihood variance factors
s2 <- glmfit$deviance / df.residual
@@ -64,9 +87,9 @@ glmQLFit.default <- function(y, design=NULL, dispersion=NULL, offset=NULL, lib.s
glmQLFTest <- function(glmfit, coef=ncol(glmfit$design), contrast=NULL, poisson.bound=TRUE)
# Quasi-likelihood F-tests for DGE glms.
# Davis McCarthy, Gordon Smyth, Aaron Lun.
-# Created 18 Feb 2011. Last modified 04 Oct 2016.
+# Created 18 Feb 2011. Last modified 28 Jan 2016.
{
- if(!is(glmfit,"DGEGLM")) stop("glmfit must be an DGEGLM object produced by glmQLFit")
+ if(!is(glmfit,"DGEGLM")) stop("glmfit must be an DGEGLM object produced by glmQLFit")
if(is.null(glmfit$var.post)) stop("need to run glmQLFit before glmQLFTest")
out <- glmLRT(glmfit, coef=coef, contrast=contrast)
@@ -81,7 +104,7 @@ glmQLFTest <- function(glmfit, coef=ncol(glmfit$design), contrast=NULL, poisson.
# Ensure is not more significant than chisquare test with Poisson variance
if(poisson.bound) {
- i <- .isBelowPoissonBound(glmfit)
+ i <- rowSums(glmfit$var.post * (1 + glmfit$fitted.values * glmfit$dispersion) < 1) > 0L
if(any(i)) {
pois.fit <- glmfit[i,]
pois.fit <- glmFit(pois.fit$counts, design=pois.fit$design, offset=pois.fit$offset, weights=pois.fit$weights, start=pois.fit$unshrunk.coefficients, dispersion=0)
@@ -98,16 +121,6 @@ glmQLFTest <- function(glmfit, coef=ncol(glmfit$design), contrast=NULL, poisson.
out
}
-.isBelowPoissonBound <- function(glmfit)
-# A convenience function to avoid generating temporary matrices.
-{
- disp <- makeCompressedMatrix(glmfit$dispersion, byrow=FALSE)
- s2 <- makeCompressedMatrix(glmfit$var.post, byrow=FALSE)
- out <- .Call(.cR_check_poisson_bound, glmfit$fitted.values, disp, s2)
- if (is.character(out)) stop(out)
- return(out)
-}
-
plotQLDisp <- function(glmfit, xlab="Average Log2 CPM", ylab="Quarter-Root Mean Deviance", pch=16, cex=0.2, col.shrunk="red", col.trend="blue", col.raw="black", ...)
# Plots the result of QL-based shrinkage.
# Davis McCarthy, Gordon Smyth, Aaron Lun, Yunshun Chen.
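For orientation, the quasi-likelihood pipeline these hunks modify is driven by two exported calls: glmQLFit() to fit and squeeze the QL dispersions, then glmQLFTest() to test. A typical invocation on simulated data:

    # Sketch of the QL pipeline on simulated counts.
    library(edgeR)
    counts <- matrix(rnbinom(40, mu = 10, size = 5), nrow = 10, ncol = 4)
    group  <- factor(c(1, 1, 2, 2))
    design <- model.matrix(~group)
    d   <- estimateDisp(DGEList(counts = counts, group = group), design)
    fit <- glmQLFit(d, design)
    qlf <- glmQLFTest(fit, coef = 2)   # test the group effect
    topTags(qlf)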
diff --git a/R/glmTreat.R b/R/glmTreat.R
index 7a5ba12..fa70ec3 100644
--- a/R/glmTreat.R
+++ b/R/glmTreat.R
@@ -1,13 +1,18 @@
-glmTreat <- function(glmfit, coef=ncol(glmfit$design), contrast=NULL, lfc=0, null="interval")
+treatDGE <- function(glmfit, coef=ncol(glmfit$design), contrast=NULL, lfc=0)
+{
+ message("treatDGE() has been renamed to glmTreat(). Please use the latter instead.")
+ glmTreat(glmfit=glmfit,coef=coef,contrast=contrast,lfc=lfc)
+}
+
+glmTreat <- function(glmfit, coef=ncol(glmfit$design), contrast=NULL, lfc=0)
# Likelihood ratio test or quasi-likelihood F-test with a threshold
# Yunshun Chen and Gordon Smyth
-# Created on 05 May 2014. Last modified on 16 Nov 2016
+# Created on 05 May 2014. Last modified on 29 April 2015
{
if(lfc < 0) stop("lfc has to be non-negative")
-
+
# Check if glmfit is from glmFit() or glmQLFit()
isLRT <- is.null(glmfit$df.prior)
- Fit <- ifelse(isLRT, glmFit, glmQLFit)
# Switch to glmLRT() or glmQLFTest() if lfc is zero
if(lfc==0) {
@@ -27,6 +32,7 @@ glmTreat <- function(glmfit, coef=ncol(glmfit$design), contrast=NULL, lfc=0, nul
stop("glmfit must be an DGEGLM object (usually produced by glmFit or glmQLFit).")
}
if(is.null(glmfit$AveLogCPM)) glmfit$AveLogCPM <- aveLogCPM(glmfit)
+ nlibs <- ncol(glmfit)
ngenes <- nrow(glmfit)
# Check design matrix
@@ -68,69 +74,78 @@ glmTreat <- function(glmfit, coef=ncol(glmfit$design), contrast=NULL, lfc=0, nul
design <- reform$design
}
unshrunk.logFC <- as.vector(unshrunk.logFC)
+ up <- unshrunk.logFC >= 0
# Null design matrix
design0 <- design[, -coef, drop=FALSE]
-# Offset adjustment
- offset.old <- makeCompressedMatrix(glmfit$offset, byrow=TRUE)
- offset.adj <- makeCompressedMatrix(lfc*log(2) * design[, coef], byrow=TRUE)
-
-# Test statistics at beta_0 = tau
- offset.new <- .addCompressedMatrices(offset.old, offset.adj)
- fit0 <- Fit(glmfit$counts, design=design0, offset=offset.new, weights=glmfit$weights, dispersion=glmfit$dispersion, prior.count=0)
- fit1 <- Fit(glmfit$counts, design=design, offset=offset.new, weights=glmfit$weights, dispersion=glmfit$dispersion, prior.count=0)
- z.left <- sqrt( pmax(0, fit0$deviance - fit1$deviance) )
-
-# Test statistics at beta_0 = -tau
- offset.new <- .addCompressedMatrices(offset.old, -offset.adj)
- fit0 <- Fit(glmfit$counts, design=design0, offset=offset.new, weights=glmfit$weights, dispersion=glmfit$dispersion, prior.count=0)
- fit1 <- Fit(glmfit$counts, design=design, offset=offset.new, weights=glmfit$weights, dispersion=glmfit$dispersion, prior.count=0)
- z.right <- sqrt( pmax(0, fit0$deviance - fit1$deviance) )
-
-# Make sure z.left < z.right
- i <- z.left > z.right
- if(any(i)) {
- tmp <- z.left[i]
- z.left[i] <- z.right[i]
- z.right[i] <- tmp
- }
+ if(isLRT) {
+# Adjusted offset
+ offset.adj <- matrix(-lfc*log(2), ngenes, 1)
+ offset.adj[up, ] <- lfc*log(2)
+
+# Test statistics at beta_0 = tau
+ offset.new <- glmfit$offset + offset.adj %*% t(design[, coef, drop=FALSE])
+ fit0.tau <- glmFit(glmfit$counts, design=design0, offset=offset.new, weights=glmfit$weights, dispersion=glmfit$dispersion, prior.count=0)
+ fit1.tau <- glmFit(glmfit$counts, design=design, offset=offset.new, weights=glmfit$weights, dispersion=glmfit$dispersion, prior.count=0)
+ X2.tau <- pmax(0, fit0.tau$deviance - fit1.tau$deviance)
+ z.tau <- sqrt(X2.tau)
+
+# Test statistics at beta_0 = -tau
+ offset.new <- glmfit$offset - offset.adj %*% t(design[, coef, drop=FALSE])
+ fit0.tau <- glmFit(glmfit$counts, design=design0, offset=offset.new, weights=glmfit$weights, dispersion=glmfit$dispersion, prior.count=0)
+ fit1.tau <- glmFit(glmfit$counts, design=design, offset=offset.new, weights=glmfit$weights, dispersion=glmfit$dispersion, prior.count=0)
+ X2.tau2 <- pmax(0, fit0.tau$deviance - fit1.tau$deviance)
+ z.tau2 <- sqrt(X2.tau2)
+
+ within <- abs(unshrunk.logFC) <= lfc
+ sgn <- 2*within - 1
+
+# Integral of Normal CDF from a to b
+ fun <- function(a,b) ifelse(a==b, pnorm(a), ( b*pnorm(b)+dnorm(b) - (a*pnorm(a)+dnorm(a)) )/(b-a) )
+ p.value <- 2*fun(-z.tau2, z.tau*sgn)
+ } else {
+# Gauss quadrature
+ if(!requireNamespace("statmod",quietly=TRUE)) stop("statmod required but is not available")
+ nnodes <- 5
+ gquad <- statmod::gauss.quad.prob(nnodes, dist="uniform", l=0, u=lfc*log(2))
+
+ offset.adj <- X2.pos <- X2.neg <- matrix(-gquad$nodes, ngenes, nnodes, byrow=TRUE)
+ offset.adj[up, ] <- -offset.adj[up, ]
+
+ for(k in 1:nnodes){
+# Test statistics at beta_0 = pos_nodes (tau)
+ offset.new <- glmfit$offset + offset.adj[,k] %*% t(design[, coef, drop=FALSE])
+ fit0.pos <- glmQLFit(glmfit$counts, design=design0, offset=offset.new, weights=glmfit$weights, dispersion=glmfit$dispersion, prior.count=0)
+ fit1.pos <- glmQLFit(glmfit$counts, design=design, offset=offset.new, weights=glmfit$weights, dispersion=glmfit$dispersion, prior.count=0)
+ X2.pos[,k] <- pmax(0, fit0.pos$deviance - fit1.pos$deviance)
+
+# Test statistics at beta_0 = neg_nodes (-tau)
+ offset.new <- glmfit$offset - offset.adj[,k] %*% t(design[, coef, drop=FALSE])
+ fit0.neg <- glmQLFit(glmfit$counts, design=design0, offset=offset.new, weights=glmfit$weights, dispersion=glmfit$dispersion, prior.count=0)
+ fit1.neg <- glmQLFit(glmfit$counts, design=design, offset=offset.new, weights=glmfit$weights, dispersion=glmfit$dispersion, prior.count=0)
+ X2.neg[,k] <- pmax(0, fit0.neg$deviance - fit1.neg$deviance)
+ }
+ z.pos <- sqrt(X2.pos)
+ z.neg <- sqrt(X2.neg)
+
+ within <- matrix(abs(unshrunk.logFC)*log(2), ngenes, nnodes) <= abs(offset.adj)
+ sgn <- 2*within - 1
-# Convert t to z under the QL pipeline
- if(!isLRT){
+# Calculate expected p-values by Gauss quadrature
+ t.pos <- z.pos/sqrt(glmfit$var.post)
+ t.neg <- z.neg/sqrt(glmfit$var.post)
df.total <- glmfit$df.prior + glmfit$df.residual.zeros
max.df.residual <- ncol(glmfit$counts)-ncol(glmfit$design)
df.total <- pmin(df.total, nrow(glmfit)*max.df.residual)
- z.left <- limma::zscoreT(z.left/sqrt(glmfit$var.post), df=df.total)
- z.right <- limma::zscoreT(z.right/sqrt(glmfit$var.post), df=df.total)
- }
-
- within <- abs(unshrunk.logFC) <= lfc
- sgn <- 2*within - 1
- z.left <- z.left*sgn
-
- null <- match.arg(null, c("interval", "worst.case"))
- if(null=="interval") {
-# Interval threshold
- c <- 1.470402
- j <- z.right + z.left > c
- p.value <- rep_len(1L, ngenes)
- j <- j[!is.na(j)]
- p.value[j] <- .integratepnorm(-z.right[j], -z.right[j] + c) + .integratepnorm(z.left[j] - c, z.left[j])
- p.value[!j] <- 2*.integratepnorm(-z.right[!j], z.left[!j])
- } else {
- p.value <- pnorm(-z.right) + pnorm(z.left)
- }
+ p.value <- ( pt(t.pos*sgn, df=df.total) + pt(t.neg, df=df.total, lower.tail=FALSE) ) %*% gquad$weights
-# Ensure it is not more significant than chisquare test with Poisson variance
- if(!isLRT){
- i <- rowSums(glmfit$var.post * (1 + glmfit$fitted.values * glmfit$dispersion) < 1) > 0L
+# Ensure is not more significant than z-test
+ i <- glmfit$var.post < 1
if(any(i)) {
- pois.fit <- glmfit[i,]
- pois.fit <- glmFit(pois.fit$counts, design=pois.fit$design, offset=pois.fit$offset, weights=pois.fit$weights, start=pois.fit$unshrunk.coefficients, dispersion=0)
- pois.res <- Recall(pois.fit, coef=coef, contrast=contrast, lfc=lfc)
- p.value[i] <- pmax(p.value[i], pois.res$table$PValue)
- }
+ z.pvalue <- ( pnorm(z.pos[i,]*sgn[i,]) + pnorm(z.neg[i,], lower.tail=FALSE) ) %*% gquad$weights
+ p.value[i] <- pmax(p.value[i], z.pvalue)
+ }
}
# Table output
@@ -151,6 +166,3 @@ glmTreat <- function(glmfit, coef=ncol(glmfit$design), contrast=NULL, lfc=0, nul
glmfit$comparison <- coef.name
new("DGELRT",unclass(glmfit))
}
-
-
-.integratepnorm <- function(a, b) ifelse(a==b, pnorm(a), ( b*pnorm(b)+dnorm(b) - (a*pnorm(a)+dnorm(a)) )/(b-a) )
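Both variants of glmTreat() use the same closed form for the average of the standard normal CDF over an interval, kept as .integratepnorm() in 3.18.1 and inlined as fun() in 3.14.0; it follows from the antiderivative of pnorm(t) being t*pnorm(t) + dnorm(t). A quick numerical check:

    # The averaged-pnorm identity used by glmTreat().
    f <- function(a, b) ifelse(a == b, pnorm(a),
      (b * pnorm(b) + dnorm(b) - (a * pnorm(a) + dnorm(a))) / (b - a))
    f(-1, 2)
    integrate(pnorm, -1, 2)$value / 3   # agrees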
diff --git a/R/glmfit.R b/R/glmfit.R
index 1bad913..48e0928 100644
--- a/R/glmfit.R
+++ b/R/glmfit.R
@@ -4,9 +4,8 @@ glmFit <- function(y, ...)
UseMethod("glmFit")
glmFit.DGEList <- function(y, design=NULL, dispersion=NULL, prior.count=0.125, start=NULL, ...)
-# Created 11 May 2011. Last modified 14 Aug 2016.
+# Created 11 May 2011. Last modified 11 March 2013.
{
- if(is.null(design)) design <- model.matrix(~y$samples$group)
if(is.null(dispersion)) dispersion <- getDispersion(y)
if(is.null(dispersion)) stop("No dispersion values found in DGEList object.")
@@ -24,7 +23,7 @@ glmFit.default <- function(y, design=NULL, dispersion=NULL, offset=NULL, lib.siz
# Fit negative binomial generalized linear model for each transcript
# to a series of digital expression libraries
# Davis McCarthy and Gordon Smyth
-# Created 17 August 2010. Last modified 03 Oct 2016.
+# Created 17 August 2010. Last modified 18 May 2015.
{
# Check y
y <- as.matrix(y)
@@ -44,10 +43,16 @@ glmFit.default <- function(y, design=NULL, dispersion=NULL, offset=NULL, lib.siz
# Check dispersion
if(is.null(dispersion)) stop("No dispersion values provided.")
- dispersion.mat <- makeCompressedMatrix(dispersion, byrow=FALSE)
+ dispersion.mat <- expandAsMatrix(dispersion, dim(y), byrow=FALSE)
# Check offset and lib.size
- offset <- .compressOffsets(y=y, lib.size=lib.size, offset=offset)
+ if(is.null(offset)) {
+ if(is.null(lib.size)) lib.size <- colSums(y)
+ if(any(lib.size==0L)) stop("Zero library size detected. Remove the empty libraries before proceeding to the next step.")
+ offset <- log(lib.size)
+ }
+ if(any(is.infinite(offset))) stop("Infinite offset value detected. Check the input offset.")
+ offset <- expandAsMatrix(offset,dim(y))
# weights are checked in lower-level functions
@@ -88,7 +93,7 @@ glmFit.default <- function(y, design=NULL, dispersion=NULL, offset=NULL, lib.siz
glmLRT <- function(glmfit,coef=ncol(glmfit$design),contrast=NULL)
# Tagwise likelihood ratio tests for DGEGLM
# Gordon Smyth, Davis McCarthy and Yunshun Chen.
-# Created 1 July 2010. Last modified 21 March 2017.
+# Created 1 July 2010. Last modified 11 June 2015.
{
# Check glmfit
if(!is(glmfit,"DGEGLM")) {
@@ -126,9 +131,10 @@ glmLRT <- function(glmfit,coef=ncol(glmfit$design),contrast=NULL)
ncontrasts <- qrc$rank
if(ncontrasts==0) stop("contrasts are all zero")
coef <- 1:ncontrasts
+ if(ncontrasts < ncol(contrast)) contrast <- contrast[,qrc$pivot[coef]]
logFC <- drop((glmfit$coefficients %*% contrast)/log(2))
if(ncontrasts>1) {
- coef.name <- paste("LR test on",ncontrasts,"degrees of freedom")
+ coef.name <- paste("LR test of",ncontrasts,"contrasts")
} else {
contrast <- drop(contrast)
i <- contrast!=0
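The likelihood-ratio counterpart of the QL pipeline: glmFit() then glmLRT(), testing either a coefficient or a contrast vector as handled in the hunk above. Sketch:

    # Sketch of the glmFit()/glmLRT() pipeline with a contrast.
    library(edgeR)
    counts <- matrix(rnbinom(60, mu = 10, size = 5), nrow = 10, ncol = 6)
    group  <- factor(rep(1:3, each = 2))
    design <- model.matrix(~0 + group)
    d   <- estimateDisp(DGEList(counts = counts, group = group), design)
    fit <- glmFit(d, design)
    lrt <- glmLRT(fit, contrast = c(-1, 1, 0))   # group2 vs group1
    topTags(lrt)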
diff --git a/R/locfitByCol.R b/R/locfitByCol.R
index dcc6ebf..b38dcbc 100644
--- a/R/locfitByCol.R
+++ b/R/locfitByCol.R
@@ -1,12 +1,13 @@
locfitByCol <- function(y, x=NULL, weights=1, span=0.5, degree=0)
# Gordon Smyth
-# 20 Aug 2012. Last modified 15 June 2016.
+# 20 Aug 2012. Last modified 2 Feb 2015.
{
y <- as.matrix(y)
ntags <- nrow(y)
weights <- rep_len(weights,ntags)
if(is.null(x)) x <- 1:ntags
if(span*ntags<2 || ntags<=1) return(y)
- for (j in 1:ncol(y)) y[,j] <- fitted(locfit(y[,j]~x,weights=weights, alpha=span,deg=degree))
+ if(!requireNamespace("locfit",quietly=TRUE)) stop("locfit required but is not available")
+ for (j in 1:ncol(y)) y[,j] <- fitted(locfit::locfit(y[,j]~x,weights=weights, alpha=span,deg=degree))
y
}
diff --git a/R/makeCompressedMatrix.R b/R/makeCompressedMatrix.R
deleted file mode 100644
index b39df76..0000000
--- a/R/makeCompressedMatrix.R
+++ /dev/null
@@ -1,164 +0,0 @@
-makeCompressedMatrix <- function(x, byrow=TRUE)
-# Coerces a NULL, scalar, vector or matrix to a compressed matrix,
-# Determines whether the rows or columns are intended to be
-# repeated, and stores this in the attributes.
-#
-# written by Aaron Lun
-# created 24 September 2016
-# last modified 27 September 2016
-{
- if (is.matrix(x)) {
- if (is(x, "compressedMatrix")) {
- return(x)
- }
- repeat.row <- repeat.col <- FALSE
- } else if (length(x)==1L) {
- repeat.row <- repeat.col <- TRUE
- x <- matrix(x)
- } else {
- if (!byrow) {
- x <- cbind(x)
- repeat.row <- FALSE
- repeat.col <- TRUE
- } else {
- x <- rbind(x)
- repeat.row <- TRUE
- repeat.col <- FALSE
- }
- }
- class(x) <- "compressedMatrix"
- attributes(x)$repeat.row <- repeat.row
- attributes(x)$repeat.col <- repeat.col
- return(x)
-}
-
-`[.compressedMatrix` <- function(x, i, j, ...)
-# A wrapper function to easily subset a makeCompressedMatrix object.
-#
-# written by Aaron Lun
-# created 24 September 2016
-{
- row.status <- attributes(x)$repeat.row
- col.status <- attributes(x)$repeat.col
- oldclass <- class(x)
- x <- unclass(x)
- if (!row.status && !missing(i)) {
- x <- x[i,,drop=FALSE]
- }
- if (!col.status && !missing(j)) {
- x <- x[,j,drop=FALSE]
- }
- class(x) <- oldclass
- attributes(x)$repeat.row <- row.status
- attributes(x)$repeat.col <- col.status
- return(x)
-}
-
-as.matrix.compressedMatrix <- function(x, ...)
-# Getting rid of odds and ends.
-#
-# written by Aaron Lun
-# created 26 September 2016
-{
- attributes(x)$repeat.row <- NULL
- attributes(x)$repeat.col <- NULL
- unclass(x)
-}
-
-.addCompressedMatrices <- function(x, y)
-# A function that performs addition of two compressedMatrix objects,
-# in a manner that best preserves memory usage.
-#
-# written by Aaron Lun
-# created 26 September 2016
-# last modified 27 September 2016
-{
- if (!is(x, "compressedMatrix") || !is(y, "compressedMatrix")) {
- stop("only two compressedMatrix objects can be added")
- }
- dims <- pmax(dim(x), dim(y))
- out <- .Call(.cR_add_repeat_matrices, x, y, dims[1], dims[2])
- if (is.character(out)) stop(out)
-
- summed <- out[[1]]
- class(summed) <- class(x)
- attributes(summed)$repeat.row <- out[[2]]
- attributes(summed)$repeat.col <- out[[3]]
- return(summed)
-}
-
-.compressOffsets <- function(y, offset, lib.size=NULL)
-# A convenience function to avoid repeatedly having to write the code below.
-# If provided, offsets take precedence over the library size.
-# If neither are provided, library sizes are automatically computed
-# as the sum of counts in the count matrix 'y'.
-# A perfunctory check for finite values is performed in the C++ code.
-# If 'offset' is already of the compressedMatrix class, then
-# we assume it's already gone through this once so we don't do it again.
-{
- if (is(offset, "compressedMatrix")) {
- return(offset)
- }
-
- if (is.null(offset)) {
- if (is.null(lib.size)) lib.size <- colSums(y)
- offset <- log(lib.size)
- }
- if (!is.double(offset)) storage.mode(offset) <- "double"
- offset <- makeCompressedMatrix(offset, byrow=TRUE)
-
- err <- .Call(.cR_check_finite, offset, "offsets")
- if (is.character(err)) stop(err)
- return(offset)
-}
-
-.compressWeights <- function(weights=NULL)
-# A convenience function to avoid repeatedly having to write the code below.
-# All weights default to 1 if not specified.
-# A perfunctory check for finite, positive values is performed in the C++ code.
-# If 'weights' is already a compressedMatrix, then we assume it's
-# already gone through this and don't do it again.
-{
- if (is(weights, "compressedMatrix")) {
- return(weights)
- }
-
- if (is.null(weights)) weights <- 1
- if (!is.double(weights)) storage.mode(weights) <- "double"
- weights <- makeCompressedMatrix(weights, byrow=TRUE)
-
- err <- .Call(.cR_check_positive, weights, "weights")
- if (is.character(err)) stop(err)
- return(weights)
-}
-
-.compressPrior <- function(prior.count)
-# Again for the prior counts, checking for non-negative finite values.
-# Skipping the check if it's already a compressedMatrix object.
-{
- if (is(prior.count, "compressedMatrix")) {
- return(prior.count)
- }
-
- if(!is.double(prior.count)) storage.mode(prior.count) <- "double"
- prior.count <- makeCompressedMatrix(prior.count, byrow=FALSE)
- err <- .Call(.cR_check_nonnegative, prior.count, "prior counts")
- if (is.character(err)) stop(err)
- return(prior.count)
-}
-
-.compressDispersions <- function(dispersion)
-# Again for the dispersions, checking for non-negative finite values.
-# Skipping the check if it's already a compressedMatrix object.
-{
- if (is(dispersion, "compressedMatrix")) {
- return(dispersion)
- }
-
- if(!is.double(dispersion)) storage.mode(dispersion) <- "double"
- dispersion <- makeCompressedMatrix(dispersion, byrow=FALSE)
- err <- .Call(.cR_check_nonnegative, dispersion, "dispersions")
- if (is.character(err)) stop(err)
- return(dispersion)
-}
-
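The deleted compressedMatrix class stores a single row or column once and records which dimension is conceptually repeated, deferring expansion until a consumer really needs a full matrix. The core idea, sketched without the deleted C code:

    # Sketch of the compressedMatrix idea: store one row, flag the repeat.
    x <- rbind(c(1.5, 2.0, 2.5))       # one row of per-library offsets
    attr(x, "repeat.row") <- TRUE      # every gene shares this row
    attr(x, "repeat.col") <- FALSE
    expand <- function(x, ngenes)      # expand only at the point of use
      matrix(x, nrow = ngenes, ncol = ncol(x), byrow = TRUE)
    expand(x, ngenes = 4)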
diff --git a/R/mglmLevenberg.R b/R/mglmLevenberg.R
index b819a5d..7044fee 100644
--- a/R/mglmLevenberg.R
+++ b/R/mglmLevenberg.R
@@ -4,46 +4,70 @@ mglmLevenberg <- function(y, design, dispersion=0, offset=0, weights=NULL, coef.
# R version by Gordon Smyth and Yunshun Chen
# C++ version by Aaron Lun
-# Created 3 March 2011. Last modified 03 Oct 2016
+# Created 3 March 2011. Last modified 11 July 2012
{
# Check arguments
y <- as.matrix(y)
if(!is.numeric(y)) stop("y is non-numeric")
+ if(any(y<0)) stop("y must be non-negative")
nlibs <- ncol(y)
ngenes <- nrow(y)
if(nlibs==0 || ngenes==0) stop("no data")
-# Checks for negative, NA or non-finite values in the count matrix.
- .isAllZero(y)
-
-# Checking the design matrix
+ if(!( all(is.finite(y)) || all(is.finite(design)) )) stop("All values must be finite and non-missing")
design <- as.matrix(design)
- if (!is.double(design)) storage.mode(design) <- "double"
- if (!all(is.finite(design))) stop("all entries of design matrix must be finite and non-missing")
-
-# Checking dispersions, offsets and weights
- offset <- .compressOffsets(y, offset=offset)
- dispersion <- .compressDispersions(dispersion)
- weights <- .compressWeights(weights)
+ dispersion <- expandAsMatrix(dispersion, dim(y), byrow=FALSE)
-# Initializing values for the coefficients at reasonable best guess with linear models.
if(is.null(coef.start)) {
start.method <- match.arg(start.method, c("null","y"))
- beta <- .Call(.cR_get_levenberg_start, y, offset, dispersion, weights, design, start.method=="null")
- if (is.character(beta)) stop(beta)
+ if(start.method=="null") N <- exp(offset)
} else {
- beta <- as.matrix(coef.start)
+ coef.start <- as.matrix(coef.start)
}
+
+ offset <- t(expandAsMatrix(offset,dim(y)))
-# Checking arguments and calling the C++ method.
+# Check weights
+ if(is.null(weights)) weights <- 1
+ weights <- t(expandAsMatrix(weights,dim(y)))
+
+# Initializing if desired. Note that lm.fit can fit in a vectorised manner,
+# where each column of the input matrix is a separate set of observations.
+ if(is.null(coef.start)) {
+ if(start.method=="y") {
+ delta <- min(max(y), 1/6)
+ y1 <- pmax(y, delta)
+ fit <- lm.wfit(design, t(log(y1)) - offset, weights)
+ beta <- fit$coefficients
+ mu <- exp(beta + offset)
+ } else {
+ N <- expandAsMatrix(N,dim(y))
+ w <- t(weights) * N/(1+dispersion*N)
+ beta.mean <- log(rowSums(y*w/N)/rowSums(w))
+ beta <- qr.coef(qr(design), matrix(beta.mean,nrow=nlibs,ncol=ngenes,byrow=TRUE))
+ mu <- exp(design %*% beta + offset)
+ }
+ } else {
+ beta <- t(coef.start)
+ mu <- exp(design %*% beta + offset)
+ }
+
+# Checking arguments and calling the C++ method. We use transposed matrices so that each can be accessed from column-major storage in C++.
+ if (!is.double(design)) storage.mode(design) <- "double"
+ if (!is.double(dispersion)) storage.mode(dispersion) <- "double"
+ if (!is.double(offset)) storage.mode(offset) <- "double"
+ if (!is.double(weights)) storage.mode(weights) <- "double"
if (!is.double(beta)) storage.mode(beta) <- "double"
- output <- .Call(.cR_levenberg, design, y, dispersion, offset, weights, beta, tol, maxit)
+ if (!is.double(mu)) storage.mode(mu) <- "double"
+ output <- .Call(.cR_levenberg, nlibs, ngenes, design, t(y), t(dispersion), offset, weights, beta, mu, tol, maxit)
# Check for error condition
if (is.character(output)) { stop(output) }
# Naming the output and returning it.
names(output) <- c("coefficients", "fitted.values", "deviance", "iter", "failed")
+ output$coefficients <- t(output$coefficients)
+ output$fitted.values <- t(output$fitted.values)
colnames(output$coefficients) <- colnames(design)
rownames(output$coefficients) <- rownames(y)
dimnames(output$fitted.values) <- dimnames(y)
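As an aside on the vectorised initialization above: lm.fit() and lm.wfit() accept a matrix response and fit each column as a separate set of observations, so a single call produces starting coefficients for every gene at once. A minimal sketch with toy data (names and dimensions are illustrative only):

set.seed(1)
ngenes <- 5; nlibs <- 4
y <- matrix(rnbinom(ngenes*nlibs, mu=10, size=5), ngenes, nlibs)
design <- cbind(Intercept=1, Group=c(0,0,1,1))
offset <- matrix(log(colSums(y)), nlibs, ngenes)  # transposed layout, as in the code above
delta <- min(max(y), 1/6)
y1 <- pmax(y, delta)                              # guard against log(0)
fit <- lm.fit(design, t(log(y1)) - offset)        # one response column per gene
beta <- fit$coefficients                          # 2 x ngenes starting coefficients
mu <- exp(design %*% beta + offset)               # nlibs x ngenes fitted means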
diff --git a/R/mglmOneGroup.R b/R/mglmOneGroup.R
index 78091eb..036d651 100644
--- a/R/mglmOneGroup.R
+++ b/R/mglmOneGroup.R
@@ -1,29 +1,47 @@
mglmOneGroup <- function(y,dispersion=0,offset=0,weights=NULL,maxit=50,tol=1e-10,verbose=FALSE,coef.start=NULL)
# Fit single-group negative-binomial glm
# Aaron Lun and Gordon Smyth
-# 18 Aug 2010. Last modified 03 Oct 2016.
+# 18 Aug 2010. Last modified 11 Sep 2014.
{
# Check y
y <- as.matrix(y)
if(!is.numeric(y)) stop("y is non-numeric")
- .isAllZero(y)
+ if(any(y<0)) stop("y must be non-negative")
+ ntags <- nrow(y)
+ nlibs <- ncol(y)
# Check dispersion
- dispersion <- .compressDispersions(dispersion)
+ dispersion <- expandAsMatrix(dispersion, dim(y), byrow=FALSE)
+ if(typeof(dispersion) != "double") stop("dispersion not floating point number")
+ if(any(dispersion<0)) stop("dispersion must be non-negative")
# Check offset
- offset <- .compressOffsets(y, offset=offset)
+ if(typeof(offset) != "double") stop("offset not floating point number")
# Check starting values
- if (is.null(coef.start)) coef.start <- NA_real_
- if (!is.double(coef.start)) storage.mode(coef.start) <- "double"
- coef.start <- makeCompressedMatrix(coef.start, byrow=FALSE)
+ if (typeof(coef.start) != "double") storage.mode(coef.start) <- "double"
+
+# All-Poisson special case
+ if(all(dispersion==0) && is.null(weights)) {
+ N <- exp(offset)
+ if(is.null(dim(N)))
+ m <- mean(N)
+ else
+ m <- .rowMeans(N,ntags,nlibs)
+ return(log(.rowMeans(y/m, ntags, nlibs)))
+ }
# Check weights
- weights <- .compressWeights(weights)
+	if(is.null(weights)) weights <- 1
+ if(typeof(weights) == "integer") storage.mode(weights) <- "double"
+ if(typeof(weights) != "double") stop("weights is non-numeric")
+
+# Expansions to full dimensions
+ offset <- expandAsMatrix(offset,dim(y))
+ weights <- expandAsMatrix(weights,dim(y))
# Fisher scoring iteration.
- output <- .Call(.cR_one_group, y, dispersion, offset, weights, maxit, tol, coef.start)
+ output <- .Call(.cR_one_group, nlibs, ntags, t(y), t(dispersion), offset, weights, maxit, tol, coef.start)
# Check error condition
if(is.character(output)) stop(output)
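As a check on the all-Poisson special case above: with zero dispersion and no weights, the genewise maximum likelihood estimate has the closed form beta_g = log( sum_j y_gj / sum_j exp(o_gj) ), which is what the rowMeans shortcut computes. A minimal sketch with toy data:

y <- matrix(rnbinom(20, mu=10, size=5), 5, 4)
offset <- log(colSums(y))
N <- matrix(exp(offset), 5, 4, byrow=TRUE)
beta.closed <- log(rowSums(y)/rowSums(N))
# should agree with the iterative fit:
# all.equal(beta.closed, mglmOneGroup(y, dispersion=0, offset=offset))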
diff --git a/R/mglmOneWay.R b/R/mglmOneWay.R
index 2e5e03b..8641a58 100644
--- a/R/mglmOneWay.R
+++ b/R/mglmOneWay.R
@@ -15,7 +15,7 @@ mglmOneWay <- function(y,design=NULL,dispersion=0,offset=0,weights=NULL,maxit=50
# by Fisher scoring with
# only a single explanatory factor in the model
# Gordon Smyth
-# 11 March 2011. Last modified 03 Oct 2016.
+# 11 March 2011. Last modified 11 Sep 2014.
{
y <- as.matrix(y)
ntags <- nrow(y)
@@ -29,30 +29,25 @@ mglmOneWay <- function(y,design=NULL,dispersion=0,offset=0,weights=NULL,maxit=50
}
ngroups <- length(levels(group))
stopifnot(ncol(design)==ngroups)
- groupbeta <- matrix(0,ntags,ngroups)
-
- offset <- .compressOffsets(y, offset=offset)
- dispersion <- .compressDispersions(dispersion)
- weights <- .compressWeights(weights)
+ mu <- matrix(0,ntags,ngroups)
+ offset <- expandAsMatrix(offset,dim(y))
+ dispersion <- expandAsMatrix(dispersion, dim(y), byrow=FALSE)
+ if(!is.null(weights)) weights <- expandAsMatrix(weights,dim(y))
firstjofgroup <- rep(0,ngroups)
new.start <- NULL
- for (g in seq_len(ngroups)) {
+ for (g in 1:ngroups) {
j <- which(group==(levels(group)[g]))
firstjofgroup[g] <- j[1]
if (!is.null(coef.start)) { new.start <- coef.start %*% design[firstjofgroup[g],] }
- groupbeta[,g] <- mglmOneGroup(y[,j,drop=FALSE], dispersion=dispersion[,j,drop=FALSE], offset=offset[,j,drop=FALSE],
- weights=weights[,j,drop=FALSE], maxit=maxit,tol=tol, coef.start=new.start)
+ mu[,g] <- mglmOneGroup(y[,j,drop=FALSE],dispersion=dispersion[,j,drop=FALSE],offset=offset[,j,drop=FALSE],weights=weights[,j,drop=FALSE],maxit=maxit,tol=tol,
+ coef.start=new.start)
}
- # Computing the fitted values from the group-wise beta's.
- mu <- .Call(.cR_get_one_way_fitted, groupbeta, offset, as.integer(group)-1L)
- if (is.character(mu)) stop(mu)
-
- # Reformatting the beta's to reflect the original design.
designunique <- design[firstjofgroup,,drop=FALSE]
- groupbeta <- pmax(groupbeta,-1e8)
- beta <- t(solve(designunique,t(groupbeta)))
-
+ mu1 <- pmax(mu,-1e8)
+ beta <- t(solve(designunique,t(mu1)))
+ mu <- mu[,group,drop=FALSE]
+ mu <- exp(mu+offset)
list(coefficients=beta,fitted.values=mu)
}
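To illustrate the back-substitution step above: in a one-way layout, the group-wise log-means relate to the design coefficients through the unique rows of the design matrix, so beta can be recovered genewise by solving a small linear system. A sketch with hypothetical log-means:

group <- factor(c(1,1,2,2))
design <- model.matrix(~group)
designunique <- design[c(1,3),]               # first library of each group
groupmeans <- rbind(c(2.0,2.5), c(1.0,3.0))   # hypothetical genes x groups log-means
beta <- t(solve(designunique, t(groupmeans)))
beta  # column 1 = baseline, column 2 = group 2 vs group 1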
diff --git a/R/nbinomDeviance.R b/R/nbinomDeviance.R
index 7e7a04a..5d2357b 100644
--- a/R/nbinomDeviance.R
+++ b/R/nbinomDeviance.R
@@ -3,42 +3,44 @@ nbinomDeviance <- function(y,mean,dispersion=0,weights=NULL)
# y is a matrix and a deviance is computed for each row
# A vector y is taken to be a matrix with one row.
# Original version 23 November 2010.
-# Last modified 03 Oct 2016.
+# Last modified 9 Dec 2013.
{
# Convert vector to row matrix
if(!is.matrix(y)) y <- array(y, c(1L,length(y)), if(!is.null(names(y))) list(NULL,names(y)))
- out <- .compute_nbdeviance(y=y, mean=mean, dispersion=dispersion, weights=weights, dosum=TRUE)
- names(out) <- rownames(y)
- out
+
+ d <- nbinomUnitDeviance(y=y,mean=mean,dispersion=dispersion)
+ if(!is.null(weights)) {
+ weights <- expandAsMatrix(weights, dim(d))
+ d <- weights*d
+ }
+ rowSums(d)
}
-.compute_nbdeviance <- function(y, mean, dispersion, weights, dosum) {
+
+nbinomUnitDeviance <- function(y,mean,dispersion=0)
+# Unit deviance for the nbinom distribution.
+{
# Check y. May be matrix or vector.
- if(!is.matrix(y)) y <- matrix(y)
+ if (!is.double(y)) storage.mode(y) <- "double"
+ ntags <- NROW(y)
nobs <- length(y)
# Check mean
- if(!is.matrix(mean)) mean <- matrix(mean)
- if(!is.double(mean)) storage.mode(mean) <- "double"
+ if (!is.double(mean)) storage.mode(mean) <- "double"
if(length(mean)<nobs) stop("mean should have same dimensions as y")
-# Check dispersion (can be tagwise (rowwise) or observation-wise).
- dispersion <- .compressDispersions(dispersion)
+# Check dispersion.
+# Can be tagwise (rowwise) or observation-wise.
+	if (!is.double(dispersion)) storage.mode(dispersion) <- "double"
+ lend <- length(dispersion)
+ if(lend < ntags) dispersion <- rep_len(dispersion, length.out=ntags)
+ if(lend > ntags && lend < nobs) dispersion <- rep_len(dispersion, length.out=nobs)
-# Check weights.
- weights <- .compressWeights(weights)
+ out <- .Call(.cR_compute_nbdev, y=y, mu=mean, phi=dispersion)
-# Computing unit deviance or residual deviance per gene, depending on 'dosum'.
- d <- .Call(.cR_compute_nbdev, y, mean, dispersion, weights, as.logical(dosum))
- if(is.character(d)) stop(d)
+# Check error status
+ if (is.character(out)) stop(out)
- return(d)
-}
-
-nbinomUnitDeviance <- function(y,mean,dispersion=0)
-# Unit deviance for the nbinom distribution.
-{
- out <- .compute_nbdeviance(y=y, mean=mean, dispersion=dispersion, weights=NULL, dosum=FALSE)
- dimnames(out) <- dimnames(y)
- return(out)
+ y[] <- out
+ return(y)
}
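For reference, the negative binomial unit deviance evaluated here is, for dispersion phi > 0, d(y,mu) = 2*[ y*log(y/mu) - (y + 1/phi)*log((y + 1/phi)/(mu + 1/phi)) ], reducing to the Poisson deviance as phi tends to zero. A rough R transcription (a sketch only; it omits the careful handling of very small dispersions that the C code provides):

nb.unit.dev <- function(y, mu, phi) {
	ylogy <- ifelse(y==0, 0, y*log(y/mu))    # y*log(y/mu), with 0*log(0) taken as 0
	if(phi==0) return(2*(ylogy - (y - mu)))  # Poisson limit
	2*(ylogy - (y + 1/phi)*log((y + 1/phi)/(mu + 1/phi)))
}
nb.unit.dev(y=5, mu=10, phi=0.1)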
diff --git a/R/normalizeChIPtoInput.R b/R/normalizeChIPtoInput.R
index ee8b6a6..5eb5d07 100644
--- a/R/normalizeChIPtoInput.R
+++ b/R/normalizeChIPtoInput.R
@@ -21,9 +21,9 @@ normalizeChIPtoInput <- function(input,response,dispersion=0.01,niter=6,loss="p"
n <- length(response)
# Handle special cases
- if(n==0) return(list(p=numeric(0),scaling.factor=NA,prop.enriched=NA))
- if(all(input==0)) return(list(p=rep(0,1),scaling.factor=0,prop.enriched=1))
- if(n==1) return(list(p=1,scaling.factor=input/response,prop.enriched=0))
+	if(n==0) return(list(p=numeric(0),scaling.factor=NA,prop.enriched=NA))
+	if(all(input==0)) return(list(p=rep(0,1),scaling.factor=0,prop.enriched=1))
+	if(n==1) return(list(p=1,scaling.factor=input/response,prop.enriched=0))
# Reset zero inputs to minimum positive value
input[input==0] <- min(input[input>0])
diff --git a/R/plotMD.R b/R/plotMD.R
index f4fb2a2..dcbc5ac 100644
--- a/R/plotMD.R
+++ b/R/plotMD.R
@@ -39,49 +39,10 @@ plotMD.DGEGLM <- function(object, column=ncol(object), coef=NULL, xlab="Average
plotWithHighlights(x=object$AveLogCPM,y=logFC,xlab=xlab,ylab=ylab,main=main,status=status,...)
}
-plotMD.DGELRT <- function(object, xlab="Average log CPM", ylab="log-fold-change", main=object$comparison, status=object$genes$Status, contrast=1, values=names(table(status)), col=NULL, adjust.method="BH", p.value=0.05, ...)
+plotMD.DGEExact <- plotMD.DGELRT <- function(object, xlab="Average log CPM", ylab="log-fold-change", main=object$comparison, status=object$genes$Status, ...)
# Mean-difference plot with color coding for controls
# Gordon Smyth
-# Created 24 June 2015. Last modified 21 March 2017.
+# Last modified 24 June 2015.
{
- logFC <- object$table$logFC
- FTest <- is.null(logFC)
-
- if(is.null(status)) {
- status <- decideTestsDGE(object, adjust.method=adjust.method, p.value=p.value)
- if(FTest) {
- status <- c("non-DE", "DE")[status+1L]
- values <- "DE"
- col <- "red"
- } else {
- status <- c("Down", "non-DE", "Up")[status+2L]
- values <- c("Up","Down")
- col <- c("red","blue")
- }
- }
-
-# Multiple contrasts
- if(FTest) {
- sel <- grep("^logFC", names(object$table))[contrast]
- if(is.na(sel)) stop("Selected contrast does not exist.")
- logFC <- object$table[, sel]
- contrast.name <- gsub("logFC[.]", "", names(object$table)[sel])
- main <- paste0("Contrast ", contrast.name)
- }
- plotWithHighlights(x=object$table$logCPM,y=logFC,xlab=xlab,ylab=ylab,main=main,status=status,values=values,col=col, ...)
-}
-
-plotMD.DGEExact <- function(object, xlab="Average log CPM", ylab="log-fold-change", main=NULL, status=object$genes$Status, values=names(table(status)), col=NULL, adjust.method="BH", p.value=0.05, ...)
-# Mean-difference plot with color coding for controls
-# Gordon Smyth
-# Created 24 June 2015. Last modified 7 Feb 2017.
-{
- if(is.null(status)) {
- status <- decideTestsDGE(object, adjust.method=adjust.method, p.value=p.value)
- status <- c("Down", "non-DE", "Up")[status+2L]
- values <- c("Up","Down")
- col <- c("red","blue")
- }
- if(is.null(main)) main <- paste(object$comparison[2],"vs",object$comparison[1])
- plotWithHighlights(x=object$table$logCPM,y=object$table$logFC,xlab=xlab,ylab=ylab,main=main,status=status,values=values,col=col,...)
+ plotWithHighlights(x=object$table$logCPM,y=object$table$logFC,xlab=xlab,ylab=ylab,main=main,status=status,...)
}
diff --git a/R/plotMDS.DGEList.R b/R/plotMDS.DGEList.R
index 593727d..85280fb 100644
--- a/R/plotMDS.DGEList.R
+++ b/R/plotMDS.DGEList.R
@@ -1,7 +1,7 @@
-plotMDS.DGEList <- function (x,top=500,labels=NULL,pch=NULL,cex=1,dim.plot=c(1,2),ndim=max(dim.plot),gene.selection="pairwise",xlab=NULL,ylab=NULL,method="logFC",prior.count=2,plot=TRUE,...)
+plotMDS.DGEList <- function (x,top=500,labels=NULL,pch=NULL,cex=1,dim.plot=c(1,2),ndim=max(dim.plot),gene.selection="pairwise",xlab=NULL,ylab=NULL,method="logFC",prior.count=2,...)
# Multidimensional scaling plot of digital gene expression profiles
# Yunshun Chen, Mark Robinson and Gordon Smyth
-# 23 May 2011. Last modified 26 Nov 2016.
+# 23 May 2011. Last modified 25 Nov 2014.
{
method <- match.arg(method, c("logfc","logFC","bcv","BCV"))
if(method=="logfc") method <- "logFC"
@@ -10,7 +10,7 @@ plotMDS.DGEList <- function (x,top=500,labels=NULL,pch=NULL,cex=1,dim.plot=c(1,2
# Default method is to convert to moderated logCPM and call limma plotMDS
if(method=="logFC") {
y <- cpm(x,log=TRUE,prior.count=prior.count)
- return(plotMDS(y,top=top,labels=labels,pch=pch,cex=cex,dim.plot=dim.plot,ndim=ndim,gene.selection=gene.selection,xlab=xlab,ylab=ylab,plot=plot,...))
+ return(plotMDS(y,top=top,labels=labels,pch=pch,cex=cex,dim.plot=dim.plot,ndim=ndim,gene.selection=gene.selection,xlab=xlab,ylab=ylab,...))
}
# From here method="bcv"
@@ -75,8 +75,5 @@ plotMDS.DGEList <- function (x,top=500,labels=NULL,pch=NULL,cex=1,dim.plot=c(1,2
mds$x <- a1[,dim.plot[1]]
mds$y <- a1[,dim.plot[2]]
mds$axislabel <- "BCV distance"
- if(plot)
- plotMDS(mds,labels=labels,pch=pch,cex=cex,xlab=xlab,ylab=ylab,...)
- else
- mds
+ plotMDS(mds,labels=labels,pch=pch,cex=cex,xlab=xlab,ylab=ylab,...)
}
diff --git a/R/predFC.R b/R/predFC.R
index 8744ace..4d7f2ec 100644
--- a/R/predFC.R
+++ b/R/predFC.R
@@ -15,20 +15,42 @@ predFC.DGEList <- function(y,design=NULL,prior.count=0.125,offset=NULL,dispersio
predFC.default <- function(y,design=NULL,prior.count=0.125,offset=NULL,dispersion=0,weights=NULL,...)
# Shrink log-fold-changes towards zero by augmenting data counts
# Gordon Smyth and Belinda Phipson
-# 17 Aug 2011. Last modified 3 Oct 2016.
+# 17 Aug 2011. Last modified 4 Nov 2012.
{
-# Add prior counts in proportion to library sizes
- out <- addPriorCount(y, offset=offset, prior.count=prior.count)
+# Check y
+ y <- as.matrix(y)
+ ngenes <- nrow(y)
+ nsamples <- ncol(y)
+
+
+# Check prior.count
+ if(prior.count<0) stop("prior.count should be non-negative")
+# Check offset
+ if(is.null(offset)) {
+ lib.size <- colSums(y)
+ offset <- log(lib.size)
+ } else
+ lib.size <- exp(offset)
# Check design
if(is.null(design)) {
warning("Behaviour of predFC with design=NULL is scheduled to be deprecated April 2014. Use cpm() instead.",call.=FALSE)
- return(cpm(y,lib.size=exp(offset),log=TRUE,prior.count=prior.count))
+ return(cpm(y,lib.size=lib.size,log=TRUE,prior.count=prior.count))
} else
design <- as.matrix(design)
+# Add prior counts in proportion to library sizes
+ if(is.null(dim(lib.size)))
+ ave.lib.size <- mean(lib.size)
+ else
+ ave.lib.size <- rowMeans(lib.size)
+ prior.count <- prior.count * lib.size/ave.lib.size
+ lib.size <- lib.size+2*prior.count
+ if(is.null(dim(prior.count))) prior.count <- matrix(prior.count,ngenes,nsamples,byrow=TRUE)
+ y <- y+prior.count
+
# Return matrix of coefficients on log2 scale
- g <- glmFit(out$y,design,offset=out$offset,dispersion=dispersion,prior.count=0,weights=weights,...)
- g$coefficients/log(2)
+ g <- glmFit(y,design,offset=log(lib.size),dispersion=dispersion,prior.count=0,weights=weights,...)
+ g$coefficients/log(2)
}
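The augmentation above in brief: the prior count is scaled in proportion to library size, added to the counts, and twice the scaled prior is added to each library size before refitting with prior.count=0. A minimal sketch (toy data, illustrative names):

y <- matrix(rnbinom(20, mu=10, size=5), 5, 4)
prior.count <- 0.125
lib.size <- colSums(y)
pc <- prior.count*lib.size/mean(lib.size)  # prior scaled to each library size
aug.y <- y + matrix(pc, 5, 4, byrow=TRUE)  # augmented counts
aug.lib.size <- lib.size + 2*pc            # augmented library sizes
# glmFit(aug.y, design, offset=log(aug.lib.size), prior.count=0) then divides
# the coefficients by log(2) to give the shrunken log2-fold-changes.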
diff --git a/R/readDGE.R b/R/readDGE.R
index 1e5b2b9..dccf58c 100644
--- a/R/readDGE.R
+++ b/R/readDGE.R
@@ -1,6 +1,6 @@
readDGE <- function(files,path=NULL,columns=c(1,2),group=NULL,labels=NULL,...)
# Read and collate a set of count data files, each file containing counts for one library
-# Created 2006. Last modified 29 July 2016.
+# Created 2006. Last modified 15 June 2015.
{
# Create data.frame to hold sample information
x <- list()
@@ -18,16 +18,18 @@ readDGE <- function(files,path=NULL,columns=c(1,2),group=NULL,labels=NULL,...)
# Set group factor
if(!is.null(group)) x$samples$group <- group
- if(is.null(x$samples$group)) x$samples$group <- rep_len(1L,nfiles)
+ if(is.null(x$samples$group)) x$samples$group <- rep(1,nfiles)
x$samples$group <- as.factor(x$samples$group)
-# Read files into a list
+# Read files
d <- taglist <- list()
for (fn in x$samples$files) {
if(!is.null(path)) fn <- file.path(path,fn)
d[[fn]] <- read.delim(fn,...,stringsAsFactors=FALSE)
taglist[[fn]] <- as.character(d[[fn]][,columns[1]])
- if(anyDuplicated(taglist[[fn]])) stop("Repeated tag sequences in",fn)
+ if(anyDuplicated(taglist[[fn]])) {
+ stop(paste("Repeated tag sequences in",fn))
+ }
}
# Collate counts for unique tags
@@ -40,13 +42,13 @@ readDGE <- function(files,path=NULL,columns=c(1,2),group=NULL,labels=NULL,...)
x$counts[aa,i] <- d[[i]][,columns[2]]
}
-# Alert user if htseq style meta genes found
- MetaTags <- grep("^__",tags,value=TRUE)
- if(length(MetaTags)) message("Meta tags detected: ",paste(MetaTags,collapse=", "))
-
# Enter library sizes and norm factors
x$samples$lib.size <- colSums(x$counts)
x$samples$norm.factors <- 1
+# Alert user if htseq style meta genes found
+ MetaTags <- grep("^__",tags,value=TRUE)
+ if(length(MetaTags)) message("Meta tags detected: ",paste(MetaTags,collapse=", "))
+
new("DGEList",x)
}
diff --git a/R/scaleOffset.R b/R/scaleOffset.R
deleted file mode 100644
index 297258f..0000000
--- a/R/scaleOffset.R
+++ /dev/null
@@ -1,37 +0,0 @@
-scaleOffset <- function(y, ...)
-UseMethod("scaleOffset")
-
-scaleOffset.default <- function(y, offset, ...)
-# Ensures scale of offsets are consistent with library sizes.
-# Aaron Lun and Yunshun Chen.
-# Created 8 Dec 2016.
-{
- if(is.matrix(y)) lib.size <- colSums(y)
- else lib.size <- y
-
- if(is.matrix(offset)) {
- if (ncol(offset)!=length(lib.size)) {
- if (! (is(offset, "compressedMatrix") && attributes(offset)$repeat.col) ) {
- stop("'ncol(offset)' should be equal to number of libraries")
- }
- }
- adj <- rowMeans(offset)
- } else {
- if (length(offset)!=length(lib.size)) {
- stop("length of 'offset' should be equal to number of libraries")
- }
- adj <- mean(offset)
- }
-
- mean(log(lib.size)) + offset - adj
-}
-
-scaleOffset.DGEList <- function(y, offset, ...)
-# Ensures scale of offsets are consistent with library sizes.
-# Aaron Lun and Yunshun Chen.
-# Created 8 Dec 2016.
-{
- y$offset <- scaleOffset(y$samples$lib.size, offset)
- y
-}
-
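For context on the deleted function: scaleOffset() recentres offsets so that their mean matches the mean log-library-size, i.e. new offset = mean(log(lib.size)) + offset - mean(offset). A one-line illustration with toy values:

lib.size <- c(1e6, 1.5e6, 2e6)
offset <- c(0.1, -0.2, 0.3)
mean(log(lib.size)) + offset - mean(offset)  # offsets on the log-library-size scale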
diff --git a/R/spliceVariants.R b/R/spliceVariants.R
index 01e3711..503da15 100644
--- a/R/spliceVariants.R
+++ b/R/spliceVariants.R
@@ -2,21 +2,18 @@ spliceVariants <- function(y, geneID, dispersion=NULL, group=NULL, estimate.gene
# Identify genes with splice variants using a negative binomial model
# We assume that the data come in a matrix or a DGEList, with counts summarized at the exon level and gene information available
# Davis McCarthy and Gordon Smyth
-# Created 4 February 2011. Last modified 3 Oct 2016.
+# Created 4 February 2011. Last modified 2 Aug 2011.
{
if( is(y, "DGEList") ) {
y.mat <- y$counts
if( is.null(group) )
group <- y$samples$group
- lib.size <- y$samples$lib.size * y$samples$norm.factors
}
else {
y.mat <- as.matrix(y)
if( is.null(group) )
stop("y is a matrix and no group argument has been supplied. Please supply group argument.")
- lib.size <- colSums(y)
}
-
geneID <- as.vector(unlist(geneID))
## Order genes by geneID: we need some way to reorganise the data---output cannot possibly be same dimension as input so this is a sensible way to organise things
o <- order(geneID)
@@ -84,9 +81,8 @@ spliceVariants <- function(y, geneID, dispersion=NULL, group=NULL, estimate.gene
full.index <- rownames(exons) %in% uniqIDs[this.genes]
if( any(this.genes) ) {
gene.counts.mat <- matrix(t(exons[full.index,]), nrow=sum(this.genes), ncol=ncol(exons)*i.exons, byrow=TRUE)
- expanded.lib.size <- rep(lib.size, i.exons)
if(i.exons==1) {
- abundance[this.genes] <- aveLogCPM(gene.counts.mat, lib.size=expanded.lib.size)
+ abundance[this.genes] <- aveLogCPM(gene.counts.mat)
splicevars.out$LR[this.genes] <- 0
splicevars.out$PValue[this.genes] <- 1
}
@@ -99,7 +95,7 @@ spliceVariants <- function(y, geneID, dispersion=NULL, group=NULL, estimate.gene
coef <- (ncol(X.null)+1):ncol(X.full)
## Fit NB GLMs to these genes
fit.this <- glmFit(gene.counts.mat, X.full, dispersion[this.genes], offset=0, prior.count=0)
- abundance[this.genes] <- aveLogCPM(gene.counts.mat, lib.size=expanded.lib.size)
+ abundance[this.genes] <- aveLogCPM(gene.counts.mat)
results.this <- glmLRT(fit.this, coef=coef)
if(sum(this.genes)==1) {
splicevars.out$LR[this.genes] <- results.this$table$LR[1]
diff --git a/R/topTags.R b/R/topTags.R
index c1a51ca..e2c1228 100644
--- a/R/topTags.R
+++ b/R/topTags.R
@@ -38,8 +38,8 @@ topTags <- function(object,n=10,adjust.method="BH",sort.by="PValue",p.value=1)
tab <- object$table[o,]
# Add adjusted p-values if appropriate
- adj.p.val <- p.adjust(object$table$PValue,method=adjust.method)
if(adjust.method != "none") {
+ adj.p.val <- p.adjust(object$table$PValue,method=adjust.method)
if(adjust.method %in% FWER.methods) adjustment <- "FWER"
if(adjust.method %in% FDR.methods) adjustment <- "FDR"
tab[[adjustment]] <- adj.p.val[o]
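The adjusted p-value column that topTags appends is a direct application of p.adjust(), for example:

pvals <- c(0.001, 0.02, 0.04, 0.3, 0.9)
p.adjust(pvals, method="BH")  # reported as an FDR column when adjust.method="BH"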
diff --git a/build/vignette.rds b/build/vignette.rds
index f35f801..12bd1b9 100644
Binary files a/build/vignette.rds and b/build/vignette.rds differ
diff --git a/inst/CITATION b/inst/CITATION
index e1bc1bf..48c71f3 100644
--- a/inst/CITATION
+++ b/inst/CITATION
@@ -1,4 +1,4 @@
-citHeader("See Section 1.2 in the User's Guide for more detail about how to cite the different edgeR pipelines.")
+citHeader("Please cite the first paper for the software itself and the other papers for the various original statistical methods implemented in edgeR. See Section 1.2 in the User's Guide for more detail.")
citEntry(
entry="article",
@@ -22,3 +22,38 @@ citEntry(
	pages = "4288-4297",
textVersion = "McCarthy DJ, Chen Y and Smyth GK (2012). Differential expression analysis of multifactor RNA-Seq experiments with respect to biological variation. Nucleic Acids Research 40, 4288-4297"
)
+
+citEntry(
+ entry="article",
+ title = "Moderated statistical tests for assessing differences in tag abundance",
+ author = "Mark D Robinson and Gordon K Smyth",
+ journal = "Bioinformatics",
+ volume = 23,
+	pages = "2881-2887",
+ year = 2007,
+ textVersion = "Robinson MD and Smyth GK (2007). Moderated statistical tests for assessing differences in tag abundance. Bioinformatics 23, 2881-2887"
+)
+
+citEntry(
+ entry="article",
+ title = "Small-sample estimation of negative binomial dispersion, with applications to SAGE data",
+ author = "Mark D Robinson and Gordon K Smyth",
+ journal = "Biostatistics",
+ volume = 9,
+	pages = "321-332",
+ year = 2008,
+ textVersion = "Robinson MD and Smyth GK (2008). Small-sample estimation of negative binomial dispersion, with applications to SAGE data. Biostatistics, 9, 321-332"
+)
+
+
+citEntry(
+ entry="article",
+ title = "Robustly detecting differential expression in RNA sequencing data using observation weights",
+ author = "Xiaobei Zhou and Helen Lindsay and Mark D Robinson",
+ journal = "Nucleic Acids Research",
+ volume = 42,
+ pages = "e91",
+ year = 2014,
+ textVersion = "Zhou X, Lindsay H, Robinson MD (2014). Robustly detecting differential expression in RNA sequencing data using observation weights. Nucleic Acids Research, 42(11), e91."
+)
+
diff --git a/inst/NEWS.Rd b/inst/NEWS.Rd
index ddb9c58..c8d1272 100644
--- a/inst/NEWS.Rd
+++ b/inst/NEWS.Rd
@@ -2,69 +2,6 @@
\title{edgeR News}
\encoding{UTF-8}
-\section{Version 3.18.0}{\itemize{
-\item
-roast.DGEList(), mroast.DGEList(), fry.DGEList() and camera.DGEList() now have explicit arguments instead of passing arguments with ... to the default method.
-
-\item
-New function scaleOffset() to ensure scale of offsets are consistent with library sizes.
-
-\item
-Added decideTests() S3 methods for DGEExact and DGELRT objects. It now works for F-tests with multiple contrasts.
-
-\item
-Report log-fold changes for redundant contrasts in F-tests with multiple contrasts.
-
-\item
-Modified plotMD() S3 method for DGELRT and DGEExact objects. It now automatically uses decideTests() and highlights the DE genes on the MD plot.
-
-\item
-New argument 'plot' in plotMDS.DGEList().
-
-\item
-Removed S3 length methods for data objects.
-
-\item
-gini() now supports NA values and avoids integer overflow.
-}}
-
-
-\section{Version 3.16.0}{\itemize{
-\item
-estimateDisp() now respects weights in calculating the APLs.
-
-\item
-Added design matrix to the output of estimateDisp().
-
-\item
-glmFit() constructs design matrix, if design=NULL, from y$samples$group.
-
-\item
-New argument 'null' in glmTreat(), and a change in how p-values are calculated by default.
-
-\item
-Modified the default 'main' in plotMD().
-
-\item
-Created a new S3 class, compressedMatrix, to store offsets and weights efficiently.
-
-\item
-Added the makeCompressedMatrix() function to make a compressedMatrix object.
-
-\item
-Switched storage of offsets in DGEGLM objects to use the compressedMatrix class.
-
-\item
-Added the addPriorCount() function for adding prior counts.
-
-\item
-Modified spliceVariants() calculation of the average log-CPM.
-
-\item
-Migrated some internal calculations and checks to C++ for greater efficiency.
-}}
-
-
\section{Version 3.14.0}{\itemize{
\item
estimateDisp(), estimateCommonDisp(), estimateTrendedDisp(), estimateTagwiseDisp(), splitIntoGroups() and equalizeLibSizes() are now S3 generic functions.
@@ -73,9 +10,6 @@ estimateDisp(), estimateCommonDisp(), estimateTrendedDisp(), estimateTagwiseDisp
The default method of estimateGLMTrendedDisp() and estimateGLMTagwiseDisp() now only return dispersion estimates instead of a list.
\item
-The DGEList methods of estimateDisp(), estimateCommonDisp() and estimateGLMCommonDisp() now use the common dispersion estimate to compute AveLogCPM and store it in the output.
-
-\item
Add fry method for DGEList objects.
\item
diff --git a/inst/doc/edgeR.pdf b/inst/doc/edgeR.pdf
index 14a9302..8ae117d 100644
Binary files a/inst/doc/edgeR.pdf and b/inst/doc/edgeR.pdf differ
diff --git a/man/addPriorCount.Rd b/man/addPriorCount.Rd
deleted file mode 100644
index 1248ba3..0000000
--- a/man/addPriorCount.Rd
+++ /dev/null
@@ -1,61 +0,0 @@
-\name{addPriorCount}
-\alias{addPriorCount}
-
-\title{Add a prior count}
-\description{Add a library size-adjusted prior count to each observation.}
-
-\usage{
-addPriorCount(y, lib.size=NULL, offset=NULL, prior.count=1)
-}
-
-\arguments{
-\item{y}{a numeric count matrix, with rows corresponding to genes and columns to libraries.}
-\item{lib.size}{a numeric vector of library sizes.}
-\item{offset}{a numeric vector or matrix of offsets.}
-\item{prior.count}{a numeric scalar or vector of prior counts to be added to each gene.}
-}
-
-\details{
-This function adds a positive prior count to each observation, often useful for avoiding zeroes during calculation of log-values.
-For example, \code{\link{predFC}} will call this function to calculate shrunken log-fold changes.
-\code{\link{aveLogCPM}} and \code{\link{cpm}} also use the same underlying code to calculate (average) log-counts per million.
-
-The actual value added to the counts for each library is scaled according to the library size.
-This ensures that the relative contribution of the prior is the same for each library.
-Otherwise, a fixed prior would have little effect on a large library, but a big effect for a small library.
-
-The library sizes are also modified, with twice the scaled prior being added to the library size for each library.
-To understand the motivation for this, consider that each observation is, effectively, a proportion of the total count in the library.
-The addition scheme implemented here represents an empirical logistic transform and ensures that the proportion can never be zero or one.
-
-If \code{offset} is supplied, this is used in favour of \code{lib.size} where \code{exp(offset)} is defined as the vector/matrix of library sizes.
-If an offset matrix is supplied, this will lead to gene-specific scaling of the prior as described above.
-
-Most use cases of this function will involve supplying a constant value to \code{prior.count} for all genes.
-However, it is also possible to use gene-specific values by supplying a vector of length equal to the number of rows in \code{y}.
-}
-
-\value{
-A list is returned containing \code{y}, a matrix of counts with the added priors; and \code{offset}, a compressedMatrix containing the (log-transformed) modified library sizes.
-}
-
-\examples{
-original <- matrix(rnbinom(1000, mu=20, size=10), nrow=200)
-head(original)
-
-out <- addPriorCount(original)
-head(out$y)
-head(out$offset)
-}
-
-\author{
-Aaron Lun
-}
-
-\seealso{
-\code{\link{aveLogCPM}},
-\code{\link{cpm}},
-\code{\link{predFC}}
-}
-
-
diff --git a/man/aveLogCPM.Rd b/man/aveLogCPM.Rd
index 4750aa8..e9c6da4 100644
--- a/man/aveLogCPM.Rd
+++ b/man/aveLogCPM.Rd
@@ -30,7 +30,7 @@ Compute average log2 counts-per-million for each row of counts.
\details{
This function uses \code{mglmOneGroup} to compute average counts-per-million (AveCPM) for each row of counts, and returns log2(AveCPM).
An average value of \code{prior.count} is added to the counts before running \code{mglmOneGroup}.
-If \code{prior.count} is a vector, each entry will be added to all counts in the corresponding row of \code{y}, as described in \code{\link{addPriorCount}}.
+If \code{prior.count} is a vector, each entry will be added to all counts in the corresponding row of \code{y}.
This function is similar to
@@ -67,7 +67,5 @@ aveLogCPM(y, dispersion=runif(nrow(y), 0, 0.2))
\seealso{
See \code{\link{cpm}} for individual logCPM values, rather than genewise averages.
-Addition of the prior count is performed using the strategy described in \code{\link{addPriorCount}}.
-
The computations for \code{aveLogCPM} are done by \code{\link{mglmOneGroup}}.
}
diff --git a/man/camera.DGEList.Rd b/man/camera.DGEList.Rd
index 1f9bc25..bb8725b 100644
--- a/man/camera.DGEList.Rd
+++ b/man/camera.DGEList.Rd
@@ -4,23 +4,16 @@
\description{
Test whether a set of genes is highly ranked relative to other genes in terms of differential expression, accounting for inter-gene correlation.
}
-
\usage{
-\method{camera}{DGEList}(y, index, design, contrast = ncol(design), weights = NULL,
- use.ranks = FALSE, allow.neg.cor=FALSE, inter.gene.cor=0.01, sort = TRUE, \dots)
+\method{camera}{DGEList}(y, index, design=NULL, contrast=ncol(design), \dots)
}
\arguments{
\item{y}{a \code{DGEList} object containing dispersion estimates.}
- \item{index}{an index vector or a list of index vectors. Can be any vector such that \code{y[index,]} selects the rows corresponding to the test set. The list can be made using \code{\link{ids2indices}}.}
- \item{design}{design matrix.}
- \item{contrast}{contrast of the linear model coefficients for which the test is required. Can be an integer specifying a column of \code{design}, or else a numeric vector of same length as the number of columns of \code{design}.}
- \item{weights}{numeric matrix of observation weights of same size as \code{y}, or a numeric vector of array weights with length equal to \code{ncol(y)}, or a numeric vector of gene weights with length equal to \code{nrow(y)}.}
- \item{use.ranks}{do a rank-based test (\code{TRUE}) or a parametric test (\code{FALSE})?}
- \item{allow.neg.cor}{should reduced variance inflation factors be allowed for negative correlations?}
- \item{inter.gene.cor}{numeric, optional preset value for the inter-gene correlation within tested sets. If \code{NA} or \code{NULL}, then an inter-gene correlation will be estimated for each tested set.}
- \item{sort}{logical, should the results be sorted by p-value?}
- \item{\dots}{other arguments are not currently used}
+  \item{index}{an index vector or a list of index vectors. Can be any vector such that \code{y[index,]} selects the rows corresponding to the test set.}
+ \item{design}{the design matrix.}
+ \item{contrast}{the contrast of the linear model coefficients for which the test is required. Can be an integer specifying a column of \code{design}, or the name of a column of \code{design}, or else a numeric vector of same length as the number of columns of \code{design}.}
+ \item{\dots}{other arguments are passed to \code{\link{camera.default}}.}
}
\details{
diff --git a/man/decidetestsDGE.Rd b/man/decidetestsDGE.Rd
index 3415812..06ebc3e 100644
--- a/man/decidetestsDGE.Rd
+++ b/man/decidetestsDGE.Rd
@@ -1,63 +1,37 @@
-\name{decideTests}
+\name{decideTestsDGE}
\alias{decideTestsDGE}
-\alias{decideTests.DGEExact}
-\alias{decideTests.DGELRT}
\title{Multiple Testing Across Genes and Contrasts}
\description{
-Identify which genes are significantly differentially expressed from an edgeR fit object containing p-values and test statistics.
+Classify a series of related differential expression statistics as up, down or not significant.
+A number of different multiple testing schemes are offered which adjust for multiple testing down the genes as well as across contrasts for each gene.
}
\usage{
decideTestsDGE(object, adjust.method="BH", p.value=0.05, lfc=0)
-\method{decideTests}{DGELRT}(object, adjust.method="BH", p.value=0.05, lfc=0, \dots)
}
\arguments{
- \item{object}{\code{DGEExact}, \code{DGELRT} or \code{glmQLFTest} object from which p-values and log-fold-changes can be extracted.}
- \item{adjust.method}{character string specifying p-value adjustment method.
- Possible values are \code{"none"}, \code{"BH"}, \code{"fdr"} (equivalent to \code{"BH"}), \code{"BY"} and \code{"holm"}.
- See \code{\link[stats]{p.adjust}} for details.}
- \item{p.value}{numeric value between 0 and 1 giving the required family-wise error rate or false discovery rate.}
- \item{lfc}{numeric, minimum absolute log2-fold-change required.}
- \item{\dots}{other arguments are not used.}
+ \item{object}{\code{DGEExact} object, output from \code{exactTest},
+ or \code{DGELRT} object, output from \code{glmLRT} or \code{glmQLFTest}, from which
+ p-values for differential expression and log-fold change values may be extracted.}
+
+ \item{adjust.method}{character string specifying p-value adjustment method. Possible values are \code{"none"}, \code{"BH"}, \code{"fdr"} (equivalent to \code{"BH"}), \code{"BY"} and \code{"holm"}. See \code{\link[stats]{p.adjust}} for details.}
+
+  \item{p.value}{numeric value between 0 and 1 giving the desired size of the test.}
+  \item{lfc}{numeric value giving the desired absolute minimum log-fold-change.}
}
\value{
-An object of class \code{\link[limma:TestResults]{TestResults}}.
-This is essentially a single-column integer matrix with elements \code{-1}, \code{0}
-or \code{1} indicating whether each gene is classified as
-significantly down-regulated, not significant or
-significant up-regulated for the comparison contained in \code{object}.
-To be considered significant, genes have to have adjusted p-value below \code{p.value} and log2-fold-change greater than \code{lfc}.
-
-If \code{object} contains F-tests or LRTs for multiple contrasts, then the genes are simply classified as significant (1) or not significant.
-In this case, the log2-fold-change threshold \code{lfc} has to be achieved by at least one of the contrasts for a gene to be significant.
+An object of class \code{TestResults} (see \code{\link[limma:TestResults]{TestResults}}).
+This is essentially a numeric matrix with elements \code{-1}, \code{0}
+or \code{1} depending on whether each DE p-value is classified as
+significant with negative log-fold change, not significant or
+significant with positive log-fold change, respectively.
}
\details{
-This function applies a multiple testing procedure and significance level cutoff to the genewise tests contained in \code{object}.
-}
-\note{
-Although this function enables users to set p-value and lfc cutoffs simultaneously, this combination criterion is not usually recommended.
-Unless the fold changes and p-values are very highly correlated, the addition of a fold change cutoff can increase the family-wise error rate or false discovery rate above the nominal level.
-Users wanting to use fold change thresholding should consider using \code{glmTreat} instead and leaving \code{lfc} at the default value when using \code{decideTestsDGE}.
+These functions implement multiple testing procedures for determining
+whether each log-fold change in a matrix of log-fold changes should be considered significantly different from zero.
+
}
\seealso{
-\code{\link[limma:decideTests]{decideTests}} and \code{\link[limma:TestResults]{TestResults}} in the limma package.
-}
-\author{Davis McCarthy, Gordon Smyth and the edgeR team}
-\examples{
-ngenes <- 100
-x1 <- rnorm(6)
-x2 <- rnorm(6)
-design <- cbind(Intercept=1,x1,x2)
-beta <- matrix(0,ngenes,3)
-beta[,1] <- 4
-beta[1:20,2] <- rnorm(20)
-mu <- 2^(beta \%*\% t(design))
-y <- matrix(rnbinom(ngenes*6,mu=mu,size=10),ngenes,6)
-fit <- glmFit(y,design,dispersion=0.1)
-lrt <- glmLRT(fit,coef=2:3)
-res <- decideTests(lrt,p.value=0.1)
-summary(res)
-lrt <- glmLRT(fit,coef=2)
-res <- decideTests(lrt,p.value=0.1)
-summary(res)
+Adapted from \code{\link[limma:decideTests]{decideTests}} in the limma package.
}
+\author{Davis McCarthy, Gordon Smyth}
\keyword{htest}
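A short usage sketch for this page, adapted from the example removed by this diff (requires edgeR):

library(edgeR)
ngenes <- 100
y <- matrix(rnbinom(ngenes*6, mu=10, size=10), ngenes, 6)
group <- factor(c(1,1,1,2,2,2))
design <- model.matrix(~group)
fit <- glmFit(y, design, dispersion=0.1)
lrt <- glmLRT(fit, coef=2)
res <- decideTestsDGE(lrt, adjust.method="BH", p.value=0.05)
summary(res)  # counts of genes classified as -1 (down), 0 (not significant), 1 (up)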
diff --git a/man/dim.Rd b/man/dim.Rd
index 06c4271..f03f31a 100644
--- a/man/dim.Rd
+++ b/man/dim.Rd
@@ -4,12 +4,18 @@
\alias{dim.TopTags}
\alias{dim.DGEGLM}
\alias{dim.DGELRT}
+\alias{length.DGEList}
+\alias{length.DGEExact}
+\alias{length.TopTags}
+\alias{length.DGEGLM}
+\alias{length.DGELRT}
\title{Retrieve the Dimensions of a DGEList, DGEExact, DGEGLM, DGELRT or TopTags Object}
\description{
Retrieve the number of rows (genes) and columns (libraries) for an DGEList, DGEExact or TopTags Object.
}
\usage{
\method{dim}{DGEList}(x)
+\method{length}{DGEList}(x)
}
\arguments{
\item{x}{an object of class \code{DGEList}, \code{DGEExact}, \code{TopTags}, \code{DGEGLM} or \code{DGELRT}}
@@ -38,5 +44,6 @@ MA <- new("MAList",list(M=M,A=A))
dim(M)
ncol(M)
nrow(M)
+length(M)
}
\keyword{array}
diff --git a/man/edgeR-package.Rd b/man/edgeR-package.Rd
index 2b9d4ea..fa6d6ce 100644
--- a/man/edgeR-package.Rd
+++ b/man/edgeR-package.Rd
@@ -5,7 +5,6 @@
\title{Empirical analysis of digital gene expression data in R}
\description{
edgeR is a package for the analysis of digital gene expression data arising from RNA sequencing technologies such as SAGE, CAGE, Tag-seq or RNA-seq, with emphasis on testing for differential expression.
-It can also be used for other sequencing technologies from which read counts are produced, such as ChIP-seq, Hi-C or CRISPR.
Particular strengths of the package include the ability to estimate biological variation between replicate libraries, and to conduct exact tests of significance which are suitable for small counts.
The package is able to make use of even minimal numbers of replicates.
@@ -39,26 +38,8 @@ Mark Robinson <mrobinson at wehi.edu.au>, Davis McCarthy <dmccarthy at wehi.edu.au>, Y
Differential expression analysis of multifactor RNA-Seq experiments with respect to biological variation.
\emph{Nucleic Acids Research} 40, 4288-4297.
-Chen, Y, Lun, ATL, and Smyth, GK (2014).
-Differential expression analysis of complex RNA-seq experiments using edgeR.
-In: \emph{Statistical Analysis of Next Generation Sequence Data},
-Somnath Datta and Daniel S Nettleton (eds), Springer, New York, pages 51-74.
-\url{http://www.statsci.org/smyth/pubs/edgeRChapterPreprint.pdf}
-
-Dai Z, Sheridan, JM, Gearing, LJ, Moore, DL, Su, S, Wormald, S, Wilcox, S, O'Connor, L, Dickins, RA, Blewitt, ME, Ritchie, ME (2014).
-edgeR: a versatile tool for the analysis of shRNA-seq and CRISPR-Cas9 genetic screens.
-\emph{F1000Research} 3, 95.
-\url{http://f1000research.com/articles/3-95}
-
-Lun, ATL, Chen, Y, and Smyth, GK (2016).
-It's DE-licious: a recipe for differential expression analyses of RNA-seq experiments using quasi-likelihood methods in edgeR.
-\emph{Methods in Molecular Biology} 1418, 391-416.
-\url{http://www.statsci.org/smyth/pubs/QLedgeRPreprint.pdf}
-
-Chen Y, Lun ATL, and Smyth, GK (2016).
-From reads to genes to pathways: differential expression analysis of RNA-Seq experiments using Rsubread and the edgeR quasi-likelihood pipeline.
-\emph{F1000Research} 5, 1438.
-\url{http://f1000research.com/articles/5-1438}
+Lund, SP, Nettleton, D, McCarthy, DJ, Smyth, GK (2012).
+Detecting differential expression in RNA-sequence data using quasi-likelihood with shrunken dispersion estimates.
+\emph{Statistical Applications in Genetics and Molecular Biology}. (Accepted 31 July 2012)
}
-
\keyword{package}
diff --git a/man/estimateDisp.Rd b/man/estimateDisp.Rd
index 5c1b4d4..eb35f09 100644
--- a/man/estimateDisp.Rd
+++ b/man/estimateDisp.Rd
@@ -41,7 +41,6 @@ Maximizes the negative binomial likelihood to give the estimate of the common, t
\value{
\code{estimateDisp.DGEList} adds the following components to the input \code{DGEList} object:
- \item{design}{the design matrix.}
\item{common.dispersion}{estimate of the common dispersion.}
\item{trended.dispersion}{estimates of the trended dispersions.}
\item{tagwise.dispersion}{tagwise estimates of the dispersion parameter if \code{tagwise=TRUE}.}
@@ -68,20 +67,20 @@ The \code{estimateDisp} function doesn't give exactly the same estimates as the
Chen, Y, Lun, ATL, and Smyth, GK (2014).
Differential expression analysis of complex RNA-seq experiments using edgeR.
In: \emph{Statistical Analysis of Next Generation Sequence Data},
-Somnath Datta and Daniel S. Nettleton (eds), Springer, New York, pages 51-74.
+Somnath Datta and Daniel S Nettleton (eds), Springer, New York.
\url{http://www.statsci.org/smyth/pubs/edgeRChapterPreprint.pdf}
Phipson, B, Lee, S, Majewski, IJ, Alexander, WS, and Smyth, GK (2016).
Robust hyperparameter estimation protects against hypervariable genes and improves power to detect differential expression.
-\emph{Annals of Applied Statistics} 10, 946-963.
-\url{http://projecteuclid.org/euclid.aoas/1469199900}
+\emph{Annals of Applied Statistics} 10.
+\url{http://arxiv.org/abs/1602.08678}
}
\author{Yunshun Chen, Gordon Smyth}
\examples{
# True dispersion is 1/5=0.2
y <- matrix(rnbinom(1000, mu=10, size=5), ncol=4)
-group <- factor(c(1,1,2,2))
+group <- c(1,1,2,2)
design <- model.matrix(~group)
d <- DGEList(counts=y, group=group)
d1 <- estimateDisp(d)
diff --git a/man/expandAsMatrix.Rd b/man/expandAsMatrix.Rd
index 3494470..c91560d 100644
--- a/man/expandAsMatrix.Rd
+++ b/man/expandAsMatrix.Rd
@@ -8,25 +8,14 @@ Expand scalar or vector to a matrix.
expandAsMatrix(x, dim=NULL, byrow=TRUE)
}
\arguments{
- \item{x}{scalar, vector, matrix or compressedMatrix.}
- \item{dim}{integer vector of length 2 specifying the required dimensions of the output matrix.}
- \item{byrow}{logical scalar specifying if matrix should be filled by columns or rows for a vector \code{x}.}
-
+ \item{x}{scalar, vector or matrix. If a vector, length must match one of the output dimensions.}
+ \item{dim}{required dimension for the output matrix.}
+ \item{byrow}{logical. Should the matrix be filled by columns or by rows (the default) if the length of the input vector is equal to both dimensions?}
}
\details{
-This function expands a scalar, row/column vector or compressedMatrix to be a matrix of dimensions \code{dim}.
-It is used internally in edgeR to convert offsets, weights and other values to a matrix for consistent handling.
-If \code{dim} is \code{NULL}, the function is equivalent to calling \code{as.matrix(x)}.
-
-If \code{x} is a vector, its length must match one of the output dimensions.
-The matrix will then be filled by repeating the matrix across the matching dimension.
-For example, if \code{length(x)==dim[1]}, the matrix will be filled such that each row contains \code{x}.
-If both dimensions match, filling is determined by \code{byrow}, with filling by rows as the default.
-
-If \code{x} is a compressedMatrix object, the size of any non-repeated dimensions must be consistent with the corresponding output dimension.
-The \code{byrow} argument will be ignored as the repeat specifications will dictate how expansion should be performed.
-See \code{?\link{compressedMatrix}} for more details.
+This function expands a row or column vector to be a matrix.
+It is used internally in edgeR to convert offsets to a matrix.
}
\value{
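A few illustrative calls, assuming the filling rules described in the details above:

library(edgeR)
expandAsMatrix(5, dim=c(3,4))    # scalar -> 3x4 matrix of 5s
expandAsMatrix(1:4, dim=c(3,4))  # length matches ncol -> each row is 1:4
expandAsMatrix(1:3, dim=c(3,4))  # length matches nrow -> each column is 1:3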
diff --git a/man/gini.Rd b/man/gini.Rd
index d5d158a..f2d7506 100644
--- a/man/gini.Rd
+++ b/man/gini.Rd
@@ -8,7 +8,7 @@ Gini index for each column of a matrix.
gini(x)
}
\arguments{
- \item{x}{a non-negative numeric matrix, or an object that can be coerced to such a matrix by \code{as.matrix}.}
+ \item{x}{non-negative numeric matrix}
}
\details{
The Gini coefficient or index is a measure of inequality or diversity.
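A columnwise sketch of the computation, assuming the standard ordered-values formula (gini() applies this to each column of x):

gini1 <- function(x) {
	x <- sort(x)
	n <- length(x)
	sum((2*seq_len(n) - n - 1)*x)/(n*sum(x))
}
gini1(rep(1, 10))       # perfect equality: 0
gini1(c(rep(0, 9), 1))  # extreme inequality: 0.9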
diff --git a/man/glmQLFTest.Rd b/man/glmQLFTest.Rd
index 3b9ac6f..ccc45ff 100644
--- a/man/glmQLFTest.Rd
+++ b/man/glmQLFTest.Rd
@@ -13,7 +13,7 @@ Conduct genewise statistical tests for a given coefficient or contrast.}
\method{glmQLFit}{DGEList}(y, design=NULL, dispersion=NULL, offset=NULL, abundance.trend=TRUE,
robust=FALSE, winsor.tail.p=c(0.05, 0.1), \dots)
\method{glmQLFit}{default}(y, design=NULL, dispersion=NULL, offset=NULL, lib.size=NULL,
- weights=NULL, abundance.trend=TRUE, AveLogCPM=NULL, robust=FALSE,
+ abundance.trend=TRUE, AveLogCPM=NULL, robust=FALSE,
winsor.tail.p=c(0.05, 0.1), \dots)
glmQLFTest(glmfit, coef=ncol(glmfit$design), contrast=NULL, poisson.bound=TRUE)
}
@@ -29,8 +29,6 @@ glmQLFTest(glmfit, coef=ncol(glmfit$design), contrast=NULL, poisson.bound=TRUE)
\item{lib.size}{numeric vector of length \code{ncol(y)} giving library sizes. Only used if \code{offset=NULL}, in which case \code{offset} is set to \code{log(lib.size)}. Defaults to \code{colSums(y)}.}
-\item{weights}{numeric matrix of same size as \code{y} giving weights for the log-linear models. If \code{NULL}, will be set to unity for all observations.}
-
\item{abundance.trend}{logical, whether to allow an abundance-dependent trend when estimating the prior values for the quasi-likelihood multiplicative dispersion parameter.}
\item{AveLogCPM}{average log2-counts per million, the average taken over all libraries in \code{y}. If \code{NULL} will be computed by \code{aveLogCPM(y)}.}
@@ -86,14 +84,9 @@ It also stores \code{df.total}, a numeric vector containing the denominator degr
}
\references{
-Chen Y, Lun ATL, and Smyth, GK (2016).
-From reads to genes to pathways: differential expression analysis of RNA-Seq experiments using Rsubread and the edgeR quasi-likelihood pipeline.
-\emph{F1000Research} 5, 1438.
-\url{http://f1000research.com/articles/5-1438}
-
-Lun, ATL, Chen, Y, and Smyth, GK (2016).
+Lun, ATL, Chen, Y, and Smyth, GK (2015).
It's DE-licious: a recipe for differential expression analyses of RNA-seq experiments using quasi-likelihood methods in edgeR.
-\emph{Methods in Molecular Biology} 1418, 391-416.
+Bioinformatics Division, Walter and Eliza Hall Institute of Medical Research, Melbourne, Australia.
\url{http://www.statsci.org/smyth/pubs/QLedgeRPreprint.pdf}
Lund, SP, Nettleton, D, McCarthy, DJ, and Smyth, GK (2012).
@@ -101,10 +94,10 @@ Detecting differential expression in RNA-sequence data using quasi-likelihood wi
\emph{Statistical Applications in Genetics and Molecular Biology} Volume 11, Issue 5, Article 8.
\url{http://www.statsci.org/smyth/pubs/QuasiSeqPreprint.pdf}
-Phipson, B, Lee, S, Majewski, IJ, Alexander, WS, and Smyth, GK (2016).
-Robust hyperparameter estimation protects against hypervariable genes and improves power to detect differential expression.
-\emph{Annals of Applied Statistics} 10, 946-963.
-\url{http://arxiv.org/abs/1602.08678}
+Phipson, B, Lee, S, Majewski, IJ, Alexander, WS, and Smyth, GK (2013).
+Empirical Bayes in the presence of exceptional cases, with application to microarray data.
+Bioinformatics Division, Walter and Eliza Hall Institute of Medical Research, Melbourne, Australia.
+\url{http://www.statsci.org/smyth/pubs/RobustEBayesPreprint.pdf}
}
\author{Yunshun Chen, Aaron Lun, Davis McCarthy and Gordon Smyth}
diff --git a/man/glmTreat.Rd b/man/glmTreat.Rd
index 42d8b8d..e709163 100644
--- a/man/glmTreat.Rd
+++ b/man/glmTreat.Rd
@@ -1,12 +1,14 @@
\name{glmTreat}
\alias{glmTreat}
+\alias{treatDGE}
\title{Test for Differential Expression Relative to a Threshold}
\description{Conduct genewise statistical tests for a given coefficient or contrast relative to a specified fold-change threshold.}
\usage{
-glmTreat(glmfit, coef = ncol(glmfit$design), contrast = NULL, lfc = 0, null = "interval")
+glmTreat(glmfit, coef=ncol(glmfit$design), contrast=NULL, lfc=0)
+treatDGE(glmfit, coef=ncol(glmfit$design), contrast=NULL, lfc=0)
}
\arguments{
@@ -17,11 +19,6 @@ glmTreat(glmfit, coef = ncol(glmfit$design), contrast = NULL, lfc = 0, null = "i
\item{contrast}{numeric vector specifying the contrast of the linear model coefficients to be tested against the log2-fold-change threshold. Length must equal to the number of columns of \code{design}. If specified, then takes precedence over \code{coef}.}
\item{lfc}{numeric scalar specifying the absolute value of the log2-fold change threshold above which differential expression is to be considered.}
-
-\item{null}{character string, choices are \code{"worst.case"} or \code{"interval"}.
-If \code{"worst.case"}, then the null hypothesis asssumes that the true logFC is on the boundary of the possible values, either at \code{lfc} or \code{-lfc}, whichever gives the largest p-value.
-This gives the most conservative results.
-If \code{"interval"}, then the null hypotheses assumes the true logFC to belong to a bounded interval of possible values.}
}
\value{
@@ -51,23 +48,14 @@ If \code{lfc=0}, then \code{glmTreat} is equivalent to \code{glmLRT} or \code{gl
If there is no shrinkage on log-fold-changes, i.e., fitting glms with \code{prior.count=0}, then \code{unshrunk.logFC} and \code{logFC} are essentially the same. Hence they are merged into one column of \code{logFC} in \code{table}.
Note that \code{glmTreat} constructs test statistics using \code{unshrunk.logFC} rather than \code{logFC}.
-\code{glmTreat} with positive \code{lfc} gives larger p-values than would be obtained with \code{lfc=0}.
-If \code{null="worst.case"}, then \code{glmTreat} conducts a test closely analogous to the \code{treat} function in the limma package.
-This conducts a test in which the null hypothesis puts the true logFC on the boundary of the \code{[-lfc,lfc]} interval closest to the observed logFC.
-If \code{null="interval"}, then the null hypothesis assumes an interval of possible values for the true logFC.
-This approach is somewhat less conservative.
-}
-
-\note{
\code{glmTreat} was previously called \code{treatDGE}.
+The old function name is now deprecated and will be removed in a future release of edgeR.
}
\author{Yunshun Chen and Gordon Smyth}
\seealso{
\code{\link{topTags}} displays results from \code{glmTreat}.
-
-\code{\link[limma]{treat}} is the corresponding function in the limma package, designed on normally distributed expression data rather than negative binomial counts.
}
\references{
diff --git a/man/goodTuring.Rd b/man/goodTuring.Rd
index bb7c21b..20dd2ef 100644
--- a/man/goodTuring.Rd
+++ b/man/goodTuring.Rd
@@ -37,7 +37,7 @@ The posterior means are then converted to proportions.
In the empirical Bayes step, the counts are smoothed by assuming a log-linear relationship between frequencies and frequencies of frequencies.
The fundamentals of the algorithm are from Good (1953).
Gale and Sampson (1995) proposed a simplified algorithm with a rule for switching between the observed and smoothed frequencies, and it is Gale and Sampson's simplified algorithm that is implemented here.
-The number of zero values in \code{x} is not used as part of the algorithm, but is returned by this function.
+The number of zero values in \code{x} is not used in the algorithm, but is returned by this function.
Sampson gives a C code version on his webpage at
\url{http://www.grsampson.net/RGoodTur.html}
@@ -53,10 +53,6 @@ uses the results to predict the proportion of each gene in each library.
Gale, WA, and Sampson, G (1995).
Good-Turing frequency estimation without tears.
\emph{Journal of Quantitative Linguistics} 2, 217-237.
-
-Good, IJ (1953).
-The population frequencies of species and the estimation of population parameters.
-\emph{Biometrika} 40, 237-264.
}
\author{Aaron Lun and Gordon Smyth, adapted from Sampson's C code from \url{http://www.grsampson.net/RGoodTur.html}}
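To sketch the smoothing step described above, with hypothetical frequency data (the real implementation also applies Gale and Sampson's rule for switching between observed and smoothed frequencies): the frequencies-of-frequencies n_r are fitted by a log-linear regression on r, and the smoothed values S(r) give the adjusted counts r* = (r+1)*S(r+1)/S(r).

r <- 1:10
nr <- c(120, 40, 24, 13, 15, 5, 11, 2, 2, 1)  # hypothetical counts of counts
fit <- lm(log(nr) ~ log(r))
S <- function(r) exp(coef(fit)[1] + coef(fit)[2]*log(r))
rstar <- (r + 1)*S(r + 1)/S(r)
rstar  # smoothed Good-Turing counts, later converted to proportions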
diff --git a/man/makeCompressedMatrix.Rd b/man/makeCompressedMatrix.Rd
deleted file mode 100644
index 604820a..0000000
--- a/man/makeCompressedMatrix.Rd
+++ /dev/null
@@ -1,108 +0,0 @@
-\title{makeCompressedMatrix}
-\name{makeCompressedMatrix}
-\alias{makeCompressedMatrix}
-\alias{compressedMatrix}
-\alias{[.compressedMatrix}
-\alias{as.matrix.compressedMatrix}
-
-\description{
-Construct a compressedMatrix object from a scalar, vector or matrix.
-}
-
-\usage{
-makeCompressedMatrix(x, byrow=TRUE)
-
-\method{[}{compressedMatrix}(x, i, j, ...)
-\method{as.matrix}{compressedMatrix}(x, ...)
-}
-
-\arguments{
- \item{x}{For \code{makeCompressedMatrix}, a scalar, vector, matrix or compressedMatrix object.
- For the S3 methods, a compressedMatrix object.}
- \item{byrow}{logical. If \code{x} is a vector, should it be repeated across rows (default) or across columns?}
- \item{i, j}{subset indices to apply to \code{x}.}
- \item{...}{additional arguments, ignored.}
-}
-
-\details{
-This function creates a compressedMatrix object from \code{x}.
-The compressedMatrix class inherits from a matrix and holds two logical scalar attributes \code{repeat.row} and \code{repeat.col}.
-Each attribute specifies whether the values are to be repeated across rows and/or across columns.
-This avoids the need to store redundant values in a full-sized matrix of dimensions \code{dim}, as would be done with \code{\link{expandAsMatrix}}.
-
-To illustrate, consider that rows usually correspond to genes while columns usually correspond to libraries.
-If we have a vector of library sizes, this will hold one unique value per library that is the same for all genes.
-Thus, we should use \code{byrow=TRUE}, which will construct a compressedMatrix object storing one row containing this vector.
-Here, \code{repeat.row=TRUE} and \code{repeat.col=FALSE}, indicating that the row is to be repeated for all genes.
-
-On the other hand, we may have a vector of gene-specific values that is the same for all libraries (e.g., dispersions).
-In this case, we should use \code{byrow=FALSE} to construct the compressedMatrix object.
-This will store one column with \code{repeat.row=FALSE} and \code{repeat.col=TRUE}, indicating that the column should be repeated across libraries.
-
-In cases where \code{x} is a scalar, \code{byrow} is ignored and both \code{repeat.row} and \code{repeat.col} will be \code{TRUE} by default.
-If \code{x} is a matrix, both attributes will be \code{FALSE}.
-If \code{x} is a compressedMatrix, it will be returned without modification.
-
-Subsetting of a compressedMatrix object depends on the values of \code{repeat.row} and \code{repeat.col}.
-If the rows are repeated, any subsetting by row will be ignored.
-Similarly, if the columns are repeated, any subsetting by column will be ignored.
-This reflects the fact that the repeated dimension has no fixed size, so subsetting on it is meaningless.
-If neither are repeated, subsetting behaves as it would for a normal matrix.
-
-Calling \code{as.matrix} will return the raw matrix without attributes or classes.
-If either the columns or rows are repeated, the corresponding dimension in the returned matrix will be of length 1.
-Otherwise, it will be of arbitrary length depending on the size/length of \code{x} used originally to construct the compressedMatrix object.
-A compressedMatrix object can also be used as input to \code{\link{expandAsMatrix}}, which will expand it to the specified dimensions.
-
-The compressedMatrix is used throughout edgeR to save space in storing offsets and (to a lesser extent) weights.
-This is because, for routine analyses, offsets are the same for all genes so it makes little sense to expand it to the full dimensions of the count matrix.
-Most functions will accept a compressedMatrix as input to \code{offset} or \code{weights} arguments.
-}
-
-\value{
-An object of class compressedMatrix, containing \code{x} and the additional attributes \code{repeat.row} and \code{repeat.col}.
-}
-
-\author{Aaron Lun}
-
-\examples{
-# Repeated rows:
-library.sizes <- runif(4, 1e6, 2e6)
-lib.mat <- makeCompressedMatrix(library.sizes, byrow=TRUE)
-lib.mat
-
-lib.mat[,1:2] # subset by column works as expected
-lib.mat[1:10,] # subset by row has no effect (see Details)
-as.matrix(lib.mat)
-expandAsMatrix(lib.mat, dim=c(10, 4))
-
-# Repeated columns:
-gene.disp <- runif(10, 0.01, 0.1)
-disp.mat <- makeCompressedMatrix(gene.disp, byrow=FALSE)
-disp.mat
-
-disp.mat[,1:2] # subset by column has no effect
-disp.mat[1:5,] # subset by row works as expected
-as.matrix(disp.mat)
-expandAsMatrix(disp.mat, dim=c(10, 4), byrow=FALSE)
-
-# Scalar:
-weights <- makeCompressedMatrix(1)
-weights[1:10,] # subsetting has no effect
-weights[,1:10]
-as.matrix(weights)
-expandAsMatrix(weights, dim=c(10, 4))
-
-# Matrix:
-offsets <- makeCompressedMatrix(matrix(runif(40), 10, 4))
-offsets[1:5,]
-offsets[,1:2]
-as.matrix(offsets)
-expandAsMatrix(offsets, dim=c(10, 4))
-}
-
-\seealso{
-\code{\link{expandAsMatrix}}
-}
-
-\keyword{hplot}
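
The space saving described in the removed Details above can be seen directly. A minimal R sketch, assuming edgeR 3.18.x (where makeCompressedMatrix() still exists) and arbitrary library sizes:

    library(edgeR)
    lib.sizes <- runif(4, 1e6, 2e6)
    cm <- makeCompressedMatrix(lib.sizes, byrow=TRUE)  # stores a single row
    full <- expandAsMatrix(cm, dim=c(10000, 4))        # one copy per gene
    object.size(cm)    # a few hundred bytes
    object.size(full)  # ~320 kB of doubles for 10000 x 4
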
diff --git a/man/nbinomDeviance.Rd b/man/nbinomDeviance.Rd
index 66d6a5b..58367f1 100644
--- a/man/nbinomDeviance.Rd
+++ b/man/nbinomDeviance.Rd
@@ -38,9 +38,9 @@ Care is taken to ensure accurate computation for small dispersion values.
}
\references{
-Jorgensen, B. (2013).
-Generalized linear models. Encyclopedia of Environmetrics 3, Wiley.
-\url{http://onlinelibrary.wiley.com/doi/10.1002/9780470057339.vag010.pub2/abstract}.
+Jorgensen, B. (2006).
+Generalized linear models. Encyclopedia of Environmetrics, Wiley.
+\url{http://onlinelibrary.wiley.com/doi/10.1002/9780470057339.vag010/full}.
McCarthy, DJ, Chen, Y, Smyth, GK (2012).
Differential expression analysis of multifactor RNA-Seq experiments with respect to biological variation.
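
The deviance documented above is the standard negative binomial unit deviance, summed over libraries for each gene. A hedged sketch on simulated data (the C code additionally guards against small or zero dispersions):

    library(edgeR)
    y <- matrix(rnbinom(20, mu=10, size=2), 5, 4)
    mu <- matrix(10, 5, 4)
    phi <- 0.5
    unit.dev <- 2*(ifelse(y > 0, y*log(y/mu), 0) -
                   (y + 1/phi)*log((y + 1/phi)/(mu + 1/phi)))
    rowSums(unit.dev)                      # manual row-wise deviance
    nbinomDeviance(y, mu, dispersion=phi)  # should agree closely
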
diff --git a/man/plotMD.Rd b/man/plotMD.Rd
index c491762..c6c189d 100644
--- a/man/plotMD.Rd
+++ b/man/plotMD.Rd
@@ -10,19 +10,16 @@ Creates a mean-difference plot (aka MA plot) with color coding for highlighted p
}
\usage{
-\method{plotMD}{DGEList}(object, column=1, xlab="Average log CPM (this sample and others)",
- ylab="log-ratio (this sample vs others)",
- main=colnames(object)[column], status=object$genes$Status,
- zero.weights=FALSE, prior.count=3, \dots)
-\method{plotMD}{DGEGLM}(object, column=ncol(object), coef=NULL, xlab="Average log CPM",
- ylab="log-fold-change", main=colnames(object)[column],
- status=object$genes$Status, zero.weights=FALSE, \dots)
-\method{plotMD}{DGELRT}(object, xlab="Average log CPM", ylab="log-fold-change",
- main=object$comparison, status=object$genes$Status, contrast=1,
- values=names(table(status)), col=NULL, adjust.method="BH", p.value=0.05, \dots)
-\method{plotMD}{DGEExact}(object, xlab="Average log CPM", ylab="log-fold-change",
- main=NULL, status=object$genes$Status, values=names(table(status)),
- col=NULL, adjust.method="BH", p.value=0.05, \dots)
+\method{plotMD}{DGEList}(object, column = 1, xlab = "Average log CPM (this sample and others)",
+ ylab = "log-ratio (this sample vs others)",
+ main = colnames(object)[column], status=object$genes$Status,
+ zero.weights = FALSE, prior.count = 3, \dots)
+\method{plotMD}{DGEGLM}(object, column = ncol(object), coef = NULL, xlab = "Average log CPM",
+ ylab = "log-fold-change", main = colnames(object)[column],
+ status=object$genes$Status, zero.weights = FALSE, \dots)
+\method{plotMD}{DGELRT}(object, xlab = "Average log CPM",
+ ylab = "log-fold-change", main = object$comparison,
+ status=object$genes$Status, \dots)
}
\arguments{
@@ -33,15 +30,9 @@ Creates a mean-difference plot (aka MA plot) with color coding for highlighted p
\item{ylab}{character string, label for y-axis}
\item{main}{character string, title for plot}
\item{status}{vector giving the control status of each spot on the array, of same length as the number of rows of \code{object}.
- If \code{NULL} under the \code{DGEList} or \code{DGEGLM} method, then all points are plotted in the default color, symbol and size.
- If \code{NULL} under the \code{DGELRT} or \code{DGEExact} method, then \code{\link{decideTestsDGE}} is run to determine the status of all the genes. The up-regulated DE genes are highlighted in red and down-regulated in blue.}
+ If \code{NULL}, then all points are plotted in the default color, symbol and size.}
\item{zero.weights}{logical, should spots with zero or negative weights be plotted?}
\item{prior.count}{the average prior count to be added to each observation. Larger values produce more shrinkage.}
- \item{contrast}{integer specifying which log-fold-change to be plotted in the case of testing multiple contrasts. Only used for the \code{DGELRT} method with multiple contrasts.}
- \item{values}{character vector giving values of \code{status} to be highlighted on the plot. Defaults to unique values of \code{status}.}
- \item{col}{vector of colors for highlighted points, either of unit length or of same length as \code{values}.}
- \item{adjust.method}{character string passed to \code{\link{decideTestsDGE}} specifying p-value adjustment method. Only used when \code{status} is \code{NULL}. See \code{\link{decideTestsDGE}} for details.}
- \item{p.value}{numeric value between 0 and 1 giving the desired size of the test. Only used and passed to \code{\link{decideTestsDGE}} when \code{status} is \code{NULL}.}
\item{\dots}{other arguments are passed to \code{\link{plotWithHighlights}}.}
}
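
A usage sketch of the DGELRT behaviour described in the removed argument text above; the data are hypothetical and the automatic decideTestsDGE() colouring applies to the 3.18.x method only:

    library(edgeR)
    group <- factor(c(1,1,2,2))
    design <- model.matrix(~group)
    d <- DGEList(counts=matrix(rnbinom(4000, mu=10, size=2), 1000, 4), group=group)
    d <- estimateDisp(d, design)
    lrt <- glmLRT(glmFit(d, design), coef=2)
    plotMD(lrt, p.value=0.05)  # status=NULL, so DE genes are coloured automatically
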
diff --git a/man/plotMDS.DGEList.Rd b/man/plotMDS.DGEList.Rd
index 41acd87..f6c6644 100644
--- a/man/plotMDS.DGEList.Rd
+++ b/man/plotMDS.DGEList.Rd
@@ -6,8 +6,9 @@ Plot samples on a two-dimensional scatterplot so that distances on the plot appr
}
\usage{
\method{plotMDS}{DGEList}(x, top = 500, labels = NULL, pch = NULL, cex = 1,
- dim.plot = c(1,2), ndim = max(dim.plot), gene.selection = "pairwise",
- xlab = NULL, ylab = NULL, method = "logFC", prior.count = 2, plot = TRUE, \dots)
+ dim.plot = c(1,2), ndim = max(dim.plot), gene.selection = "pairwise",
+ xlab = NULL, ylab = NULL, method = "logFC", prior.count = 2,
+ \dots)
}
\arguments{
\item{x}{a \code{DGEList} object.}
@@ -22,7 +23,6 @@ Plot samples on a two-dimensional scatterplot so that distances on the plot appr
\item{ylab}{y-axis label}
 \item{method}{method used to compute distances. Possible values are \code{"logFC"} or \code{"bcv"}.}
\item{prior.count}{average prior count to be added to observation to shrink the estimated log-fold-changes towards zero. Only used when \code{method="logFC"}.}
- \item{plot}{logical. If \code{TRUE} then a plot is created on the current graphics device.}
\item{\dots}{any other arguments are passed to \code{plot}.}
}
@@ -45,7 +45,7 @@ Note that the \code{"bcv"} method is slower than the \code{"logFC"} method when
}
\value{
-An object of class \code{\link[limma:plotMDS]{MDS}} is invisibly returned and (if \code{plot=TRUE}) a plot is created on the current graphics device.
+An object of class \code{\link[limma:plotMDS]{MDS}} is invisibly returned and a plot is created on the current graphics device.
}
\author{Yunshun Chen, Mark Robinson and Gordon Smyth}
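
A usage sketch on hypothetical data; the invisibly returned MDS object is the same in both versions, while the plot argument removed here exists only in 3.18.x:

    library(edgeR)
    y <- DGEList(counts=matrix(rnbinom(4000, mu=10, size=2), 1000, 4))
    y <- calcNormFactors(y)
    mds <- plotMDS(y, method="logFC", prior.count=2)  # MDS object, invisibly
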
diff --git a/man/plotQLDisp.Rd b/man/plotQLDisp.Rd
index df4ac9e..0d78dff 100644
--- a/man/plotQLDisp.Rd
+++ b/man/plotQLDisp.Rd
@@ -14,7 +14,7 @@ plotQLDisp(glmfit, xlab="Average Log2 CPM", ylab="Quarter-Root Mean Deviance", p
\item{ylab}{label for the y-axis.}
\item{pch}{the plotting symbol. See \code{\link{points}} for more details.}
\item{cex}{plot symbol expansion factor. See \code{\link{points}} for more details.}
- \item{col.shrunk}{color of the points representing the squeezed quasi-likelihood dispersions.}
+ \item{col.shrunk}{color of the points representing the shrunk quasi-likelihood dispersions.}
\item{col.trend}{color of line showing dispersion trend.}
\item{col.raw}{color of points showing the unshrunk dispersions.}
\item{\dots}{any other arguments are passed to \code{plot}.}
@@ -30,7 +30,7 @@ The quarter-root transformation is applied to improve visibility for dispersions
A plot is created on the current graphics device.
}
-\author{Aaron Lun, Davis McCarthy, Gordon Smyth, Yunshun Chen.}
+\author{Aaron Lun, based on code by Davis McCarthy and Gordon Smyth}
\examples{
nbdisp <- 1/rchisq(1000, df=10)
@@ -45,11 +45,4 @@ fit <- glmQLFit(y, design, abundance.trend=FALSE)
plotQLDisp(fit)
}
-\references{
-Chen Y, Lun ATL, and Smyth, GK (2016).
-From reads to genes to pathways: differential expression analysis of RNA-Seq experiments using Rsubread and the edgeR quasi-likelihood pipeline.
-\emph{F1000Research} 5, 1438.
-\url{http://f1000research.com/articles/5-1438}
-}
-
\keyword{plot}
diff --git a/man/plotSmear.Rd b/man/plotSmear.Rd
index 381c695..dd8faa5 100644
--- a/man/plotSmear.Rd
+++ b/man/plotSmear.Rd
@@ -1,12 +1,11 @@
\name{plotSmear}
\alias{plotSmear}
\title{
-Smear plot
+Plots log-Fold Change versus log-Concentration (or M versus A) for Count Data
}
\description{
-Make a mean-difference plot of two libraries of count data with smearing of points with very low counts, especially those that are zero for one of the columns.
+Both of these functions plot the log-fold change (i.e. the log of the ratio of expression levels for each gene between two experimental groups) against the log-concentration (i.e. the overall average expression level for each gene across the two groups). To represent counts that were low (e.g. zero in 1 library and non-zero in the other) in one of the two conditions, a 'smear' of points at low A value is presented in \code{plotSmear}.
}
-
\usage{
plotSmear(object, pair=NULL, de.tags=NULL, xlab="Average logCPM", ylab="logFC", pch=19,
cex=0.2, smearWidth=0.5, panel.first=grid(), smooth.scatter=FALSE, lowess=FALSE, \dots)
@@ -26,22 +25,16 @@ plotSmear(object, pair=NULL, de.tags=NULL, xlab="Average logCPM", ylab="logFC",
\item{\dots}{further arguments passed on to \code{plot}}
}
-\value{
-Invisibly returns the x and y coordinates of the plotted points, and a plot is created on the current device.
-}
+\value{A plot is created on the current graphics device.}
\details{
-\code{plotSmear} produces a type of mean-difference plot (or MA plot) with a special representation (smearing) of log-ratios that are infinite.
-\code{plotSmear} resolves the problem of plotting genes that have a total count of zero for one of the groups by adding the 'smear' of points at low A value.
-The points to be smeared are identified as being equal to the minimum estimated concentration in one of the two groups.
-The smear is created by using random uniform numbers of width \code{smearWidth} to the left of the minimum A.
-\code{plotSmear} also allows easy highlighting of differentially expressed (DE) genes.
+\code{plotSmear} is a more sophisticated and superior way to produce an 'MA plot'. \code{plotSmear} resolves the problem of plotting genes that have a total count of zero for one of the groups by adding the 'smear' of points at low A value. The points to be smeared are identified as being equal to the minimum estimated concentration in one of the two groups. The smear is created by using random uniform numbers of width \code{smearWidth} to the left of the minimum A. \code{plotSmear} also allows easy highlighting of differentially expressed (DE) genes.
}
\author{Mark Robinson, Davis McCarthy}
\seealso{
-\code{\link{maPlot}}, \code{\link{plotMD.DGEList}}
+\code{\link{maPlot}}
}
\examples{
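
A toy illustration of the smearing rule from the Details above, in base R only (this is not edgeR's internal code):

    A <- c(2.3, 3.1, 4.0, 5.2)  # finite average-abundance (A) values
    n.zero <- 5                 # genes with zero counts in one group
    smearWidth <- 0.5
    smeared.A <- runif(n.zero, min(A) - smearWidth, min(A))  # just left of min(A)
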
diff --git a/man/predFC.Rd b/man/predFC.Rd
index 43858e2..c447854 100644
--- a/man/predFC.Rd
+++ b/man/predFC.Rd
@@ -29,11 +29,11 @@
}
\details{
-This function computes predictive log-fold changes (pfc) for a NB GLM.
+This function computes predictive log-fold changes (pfc) for a NB glm.
The pfc are posterior Bayesian estimators of the true log-fold-changes.
They are predictive of values that might be replicated in a future experiment.
-Specifically, the function adds a small prior count to each observation before fitting the GLM (see \code{\link{addPriorCount}} for details).
+Specifically, the function adds a small prior count to each observation before estimating the glm.
The actual prior count that is added is proportional to the library size.
This has the effect that any log-fold-change that was zero prior to augmentation remains zero and non-zero log-fold-changes are shrunk towards zero.
@@ -78,7 +78,6 @@ abline(a=0,b=1)
}
\seealso{
-\code{\link{glmFit}}, \code{\link{exactTest}},
-\code{\link{addPriorCount}}
+\code{\link{glmFit}}, \code{\link{exactTest}}
}
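
A hedged usage sketch of the shrinkage described above, on hypothetical simulated data:

    library(edgeR)
    y <- matrix(rnbinom(4000, mu=10, size=2), 1000, 4)
    design <- model.matrix(~factor(c(1,1,2,2)))
    pfc <- predFC(y, design, prior.count=1, dispersion=0.1)
    head(pfc)  # larger prior.count shrinks log-fold-changes harder towards zero
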
diff --git a/man/processAmplicons.Rd b/man/processAmplicons.Rd
index c8643d8..7bf7d17 100644
--- a/man/processAmplicons.Rd
+++ b/man/processAmplicons.Rd
@@ -48,41 +48,21 @@ processAmplicons(readfile, readfile2=NULL, barcodefile, hairpinfile,
\item{lib.size}{auto-calculated column sum of the counts matrix}
}
-\details{
-The \code{processAmplicons} function assumes the sequences in your fastq files have a fixed structure (as per Figure 1A of Dai et al, 2014).
+\details{The \code{processAmplicons} function assumes the sequences in your fastq files have a fixed structure (as per Figure 1A of http://f1000research.com/articles/3-95/v2). It cannot be used if your hairpins/sgRNAs/sample index sequences are in random locations within each read. You will need to customise your own sequence processing pipeline if this is the case, but can still complete your downstream analysis using edgeR.
-The input barcode file and hairpin/sgRNA files are tab-separated text files with at least two columns (named 'ID' and 'Sequences') containing the sample or hairpin/sgRNA ids and a second column indicating the sample index or hairpin/sgRNA sequences to be matched.
-If \code{barcode2Start} and \code{barcode2End} are specified, a third column 'Sequences2' is expected in the barcode file.
-If \code{readfile2}, \code{barcodeStartRev} and \code{barcodeEndRev} are specified, another column 'SequencesReverse' is expected in the barcode file.
-The barcode file may also contain a 'group' column that indicates which experimental group a sample belongs to.
-Additional columns in each file will be included in the respective \code{$samples} or \code{$genes} data.frames of the final \code{\link[edgeR:DGEList-class]{DGEList}} object.
-These files, along with the fastq file(s), are assumed to be in the current working directory.
+The input barcode file and hairpin/sgRNA files are tab-separated text files with at least two columns (named 'ID' and 'Sequences') containing the sample or hairpin/sgRNA ids and a second column indicating the sample index or hairpin/sgRNA sequences to be matched. If \code{barcode2Start} and \code{barcode2End} are specified, a third column 'Sequences2' is expected in the barcode file. If \code{readfile2}, \code{barcodeStartRev} and \code{barcodeEndRev} are specified, another column 'SequencesReverse' is expected in the barcode file. The barcode file may also contain a 'group' column that indicates which experimental group a sample belongs to. Additional columns in each file will be included in the respective \code{$samples} or \code{$genes} data.frames of the final \code{\link[edgeR:DGEList-class]{DGEList}} object. These files, along with the fastq file(s), are assumed to be in the current working directory.
-To compute the count matrix, matching to the given barcodes and hairpins/sgRNAs is conducted in two rounds.
-The first round looks for an exact sequence match for the given barcode sequences and hairpin/sgRNA sequences at the locations specified.
-If \code{allowShifting} is set to \code{TRUE}, the program also checks if a given hairpin/sgRNA sequence can be found at a neighbouring position in the read.
-If a match isn't found, the program performs a second round of matching which allows for sequence mismatches if \code{allowMismatch} is set to \code{TRUE}.
-The program also checks parameter \code{allowShiftedMismatch} which accommodates mismatches at the shifted positions.
-The maximum number of mismatch bases in barcode and hairpin/sgRNA are specified by the parameters \code{barcodeMismatchBase} and \code{hairpinMismatchBase}.
+To compute the count matrix, matching to the given barcodes and hairpins/sgRNAs is conducted in two rounds. The first round looks for an exact sequence match for the given barcode sequences and hairpin/sgRNA sequences at the locations specified. If \code{allowShifting} is set to \code{TRUE}, the program also checks if a given hairpin/sgRNA sequence can be found at a neighbouring position in the read. If a match isn't found, the program performs a second round of matching which allows for sequence mismatches if \code{allowMismatch} is set to \code{TRUE}. The program also checks parameter \code{allowShiftedMismatch} which accommodates mismatches at the shifted positions. The maximum number of mismatch bases in barcode and hairpin/sgRNA are specified by the parameters \code{barcodeMismatchBase} and \code{hairpinMismatchBase}.
-The program outputs a \code{\link[edgeR:DGEList-class]{DGEList}} object, with a count matrix indicating the number of times each barcode and hairpin/sgRNA combination could be matched in reads from input fastq file(s).
+The program outputs a \code{\link[edgeR:DGEList-class]{DGEList}} object, with a count matrix indicating the number of times each barcode and hairpin/sgRNA combination could be matched in reads from input fastq file(s).
-For further examples and data, refer to the case studies available from \url{http://bioinf.wehi.edu.au/shRNAseq}.
-}
-
-\note{
-This function replaced the earlier function \code{processHairpinReads} in edgeR 3.7.17.
-
-This function cannot be used if the hairpins/sgRNAs/sample index sequences are in random locations within each read.
-If that is the case, then analysts will need to customise their own sequence processing pipeline, although edgeR can still be used for downstream analysis.
+For further examples and data, refer to the case studies available from http://bioinf.wehi.edu.au/shRNAseq/.
+Note: The \code{processAmplicons} function supersedes the earlier \code{processHairpinReads} function.
}
\author{Zhiyin Dai and Matthew Ritchie}
\references{
-Dai Z, Sheridan JM, Gearing, LJ, Moore, DL, Su, S, Wormald, S, Wilcox, S, O'Connor, L, Dickins, RA, Blewitt, ME, Ritchie, ME(2014).
-edgeR: a versatile tool for the analysis of shRNA-seq and CRISPR-Cas9 genetic screens.
-\emph{F1000Research} 3, 95.
-\url{http://f1000research.com/articles/3-95}
+Dai Z, Sheridan JM, et al. (2014). edgeR: a versatile tool for the analysis of shRNA-seq and CRISPR-Cas9 genetic screens. F1000Research, 3:95. http://f1000research.com/articles/3-95/v2. PMID: 24860646.
}
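
A call sketch matching the Details above. The file names are hypothetical placeholders; the call only runs against real fastq/annotation files with the fixed read structure:

    library(edgeR)
    x <- processAmplicons(readfile="sample.fastq",
                          barcodefile="barcodes.txt",
                          hairpinfile="hairpins.txt",
                          allowShifting=TRUE, allowMismatch=TRUE)
    dim(x$counts)  # hairpin/sgRNA by sample count matrix in a DGEList
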
diff --git a/man/roast.DGEList.Rd b/man/roast.DGEList.Rd
index 377a939..1aaf419 100644
--- a/man/roast.DGEList.Rd
+++ b/man/roast.DGEList.Rd
@@ -8,37 +8,17 @@ Rotation gene set testing for Negative Binomial generalized linear models.
}
\usage{
-\method{roast}{DGEList}(y, index = NULL, design = NULL, contrast = ncol(design), geneid = NULL,
- set.statistic = "mean", gene.weights = NULL, \dots)
-
-\method{mroast}{DGEList}(y, index = NULL, design = NULL, contrast = ncol(design), geneid = NULL,
- set.statistic = "mean", gene.weights = NULL,
- adjust.method = "BH", midp = TRUE, sort = "directional", ...)
-
-\method{fry}{DGEList}(y, index = NULL, design = NULL, contrast = ncol(design), geneid = NULL,
- sort = "directional", ...)
+\method{roast}{DGEList}(y, index=NULL, design=NULL, contrast=ncol(design), \dots)
+\method{mroast}{DGEList}(y, index=NULL, design=NULL, contrast=ncol(design), \dots)
+\method{fry}{DGEList}(y, index=NULL, design=NULL, contrast=ncol(design), \dots)
}
\arguments{
\item{y}{\code{DGEList} object.}
- \item{index}{index vector specifying which rows (probes) of \code{y} are in the test set.
- Can be a vector of integer indices, or a logical vector of length \code{nrow(y)}, or a vector of gene IDs corresponding to entries in \code{geneid}.
- Alternatively it can be a data.frame with the first column containing the index vector and the second column containing directional gene weights.
- For \code{mroast} or \code{fry}, \code{index} is a list of index vectors or a list of data.frames. }
+ \item{index}{index vector specifying which rows (genes) of \code{y} are in the test set. This can be a vector of indices, or a logical vector of length \code{nrow(y)}, or any vector such that \code{y[iset,]} contains the values for the gene set to be tested. Defaults to all genes. For \code{mroast} a list of index vectors.}
\item{design}{design matrix}
- \item{contrast}{contrast for which the test is required.
- Can be an integer specifying a column of \code{design}, or the name of a column of \code{design}, or a numeric contrast vector of length equal to the number of columns of \code{design}.}
- \item{geneid}{gene identifiers corresponding to the rows of \code{y}.
- Can be either a vector of length \code{nrow(y)} or the name of the column of \code{y$genes} containing the gene identifiers.
- Defaults to \code{rownames(y)}.}
- \item{set.statistic}{summary set statistic. Possibilities are \code{"mean"},\code{"floormean"},\code{"mean50"} or \code{"msq"}.}
- \item{gene.weights}{numeric vector of directional (positive or negative) genewise weights.
- For \code{mroast} or \code{fry}, this vector must have length equal to \code{nrow(y)}.
- For \code{roast}, can be of length \code{nrow(y)} or of length equal to the number of genes in the test set.}
- \item{adjust.method}{method used to adjust the p-values for multiple testing. See \code{\link{p.adjust}} for possible values.}
- \item{midp}{logical, should mid-p-values be used instead of ordinary p-values when adjusting for multiple testing?}
- \item{sort}{character, whether to sort output table by directional p-value (\code{"directional"}), non-directional p-value (\code{"mixed"}), or not at all (\code{"none"}).}
- \item{\dots}{other arguments are currently ignored.}
+ \item{contrast}{contrast for which the test is required. Can be an integer specifying a column of \code{design}, or the name of a column of \code{design}, or else a contrast vector of length equal to the number of columns of \code{design}.}
+ \item{\dots}{other arguments are passed to \code{\link{roast.default}} or \code{\link{mroast.default}}.}
}
\value{
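
A usage sketch for the DGEList method on hypothetical data and an arbitrary gene set:

    library(edgeR)
    design <- model.matrix(~factor(c(1,1,2,2)))
    y <- DGEList(counts=matrix(rnbinom(4000, mu=10, size=2), 1000, 4))
    y <- estimateDisp(y, design)
    iset <- 1:50  # index vector for the set under test
    roast(y, index=iset, design=design, contrast=2)
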
diff --git a/man/scaleOffset.Rd b/man/scaleOffset.Rd
deleted file mode 100644
index 2b6d339..0000000
--- a/man/scaleOffset.Rd
+++ /dev/null
@@ -1,35 +0,0 @@
-\name{scaleOffset}
-\alias{scaleOffset}
-\alias{scaleOffset.DGEList}
-\alias{scaleOffset.default}
-
-\title{Scale offsets}
-
-\description{Ensures that the scale of offsets is consistent with library sizes.}
-
-\usage{
-\method{scaleOffset}{DGEList}(y, offset, \dots)
-\method{scaleOffset}{default}(y, offset, \dots)
-}
-\arguments{
-\item{y}{numeric vector or matrix of counts, or a \code{DGEList} object.}
-\item{offset}{numeric vector or matrix of offsets to be scaled.}
-\item{\dots}{other arguments that are not currently used.}
-}
-
-\value{numeric vector or matrix of scaled offsets.}
-
-\details{
-\code{scaleOffset} ensures that the scale of offsets is consistent with library sizes.
-This is done by ensuring that the mean offset for each gene is the same as the mean log-library size.
-The length or dimensions of \code{offset} should be consistent with the number of libraries in \code{y}.
-}
-
-\author{Aaron Lun, Yunshun Chen}
-
-\examples{
-y <- matrix(rnbinom(40,size=1,mu=100),10,4)
-offset <- rnorm(4)
-scaleOffset(y, offset)
-}
-
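
The property stated in the removed Details (mean offset equals mean log-library size) can be checked directly; a sketch assuming edgeR 3.18.x, where scaleOffset() exists:

    library(edgeR)
    y <- matrix(rnbinom(40, size=1, mu=100), 10, 4)
    offset <- rnorm(4)
    so <- scaleOffset(y, offset)
    mean(so)               # should equal ...
    mean(log(colSums(y)))  # ... the mean log-library size
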
diff --git a/man/topTags.Rd b/man/topTags.Rd
index 66c9b74..f1bbd0e 100755
--- a/man/topTags.Rd
+++ b/man/topTags.Rd
@@ -27,7 +27,7 @@ topTags(object, n=10, adjust.method="BH", sort.by="PValue", p.value=1)
an object of class \code{TopTags} containing the following elements for the top \code{n} most differentially expressed tags as determined by \code{sort.by}:
-\item{table}{a data frame containing the elements \code{logFC}, the log-abundance ratio, i.e. fold change, for each tag in the two groups being compared, \code{logCPM}, the log-average concentration/abundance for each tag in the two groups being compared, \code{PValue}, exact p-value for differential expression using the NB model. When \code{adjust.method} is not \code{"none"}, there is an extra column of \code{FDR} showing the adjusted p-value if \code{adjust.method} is one of the \code [...]
+\item{table}{a data frame containing the elements \code{logFC}, the log-abundance ratio, i.e. fold change, for each tag in the two groups being compared, \code{logCPM}, the log-average concentration/abundance for each tag in the two groups being compared, \code{PValue}, exact p-value for differential expression using the NB model, and \code{FDR}, the p-value adjusted for multiple testing by \code{p.adjust} using the specified method.}
\item{adjust.method}{character string stating the method used to adjust p-values for multiple testing.}
\item{comparison}{a vector giving the names of the two groups being compared.}
\item{test}{character string stating the name of the test.}
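
The FDR column described above is p.adjust() applied to all p-values before the table is sorted and truncated; a hedged check on simulated data:

    library(edgeR)
    d <- DGEList(counts=matrix(rnbinom(4000, mu=10, size=2), 1000, 4),
                 group=factor(c(1,1,2,2)))
    d <- estimateCommonDisp(d)
    et <- exactTest(d)
    tt <- topTags(et, n=10)
    o <- order(et$table$PValue)[1:10]
    all.equal(tt$table$FDR, p.adjust(et$table$PValue, "BH")[o])  # should be TRUE
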
diff --git a/src/R_add_prior_count.cpp b/src/R_add_prior_count.cpp
deleted file mode 100644
index cb0a53f..0000000
--- a/src/R_add_prior_count.cpp
+++ /dev/null
@@ -1,64 +0,0 @@
-#include "matvec_check.h"
-#include "utils.h"
-#include "add_prior.h"
-
-/**** Adding a prior count to each observation. *******/
-
-SEXP R_add_prior_count(SEXP y, SEXP offset, SEXP prior) try {
- count_holder counts(y);
- const int num_tags=counts.get_ntags();
- const int num_libs=counts.get_nlibs();
- double* count_ptr=(double*)R_alloc(num_libs, sizeof(double));
-
- add_prior AP(num_tags, num_libs, prior, offset, true, true);
- const double* const out_prior=AP.get_priors();
- const double* const out_off=AP.get_offsets();
- const bool priors_are_the_same=AP.same_across_rows();
-
- SEXP output=PROTECT(allocVector(VECSXP, 2));
- try {
- SET_VECTOR_ELT(output, 0, allocMatrix(REALSXP, num_tags, num_libs));
- double* outptr=REAL(VECTOR_ELT(output, 0));
-
- double* libptr;
- if (priors_are_the_same) {
- // Just doing this once to save time, if they are all the same.
- AP.fill_and_next();
- SET_VECTOR_ELT(output, 1, allocVector(REALSXP, num_libs));
- libptr=REAL(VECTOR_ELT(output, 1));
- std::copy(out_off, out_off+num_libs, libptr);
- } else {
- SET_VECTOR_ELT(output, 1, allocMatrix(REALSXP, num_tags, num_libs));
- libptr=REAL(VECTOR_ELT(output, 1));
- }
-
- // Adding a library size-adjusted prior to each count.
- int lib;
- for (int tag=0; tag<num_tags; ++tag) {
- counts.fill_and_next(count_ptr);
-
- if (!priors_are_the_same) {
- // Repeating with the next set of priors/offsets, and storing the new offsets.
- AP.fill_and_next();
- for (lib=0; lib<num_libs; ++lib) {
- libptr[lib*num_tags]=out_off[lib];
- }
- ++libptr;
- }
-
- for (lib=0; lib<num_libs; ++lib) {
- outptr[lib*num_tags]=count_ptr[lib] + out_prior[lib];
- }
- ++outptr;
- }
- } catch (std::exception& e) {
- UNPROTECT(1);
- throw;
- }
-
- UNPROTECT(1);
- return output;
-} catch (std::exception& e) {
- return mkString(e.what());
-}
-
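
An R-level sketch of what the removed kernel computes, following its comments: a library-size-proportional prior is added to every count, and the offsets are adjusted to match. Function and argument names here are illustrative, not edgeR's API:

    add.prior <- function(counts, lib.size, prior=1) {
      pr <- prior * lib.size/mean(lib.size)  # per-library scaled prior
      list(counts = t(t(counts) + pr),
           offset = log(lib.size + 2*pr))    # library sizes bumped by twice the prior
    }
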
diff --git a/src/R_add_repeat_matrices.cpp b/src/R_add_repeat_matrices.cpp
deleted file mode 100644
index 8e6c589..0000000
--- a/src/R_add_repeat_matrices.cpp
+++ /dev/null
@@ -1,41 +0,0 @@
-#include "matvec_check.h"
-#include "utils.h"
-
-SEXP R_add_repeat_matrices(SEXP x, SEXP y, SEXP nr, SEXP nc) try {
- if (!isInteger(nr) || LENGTH(nr)!=1) { throw std::runtime_error("number of rows should be an integer scalar"); }
- const int nrows=asInteger(nr);
- if (!isInteger(nc) || LENGTH(nc)!=1) { throw std::runtime_error("number of columns should be an integer scalar"); }
- const int ncols=asInteger(nc);
-
- matvec_check allx(x, nrows, ncols);
- const double* const xptr2=allx.access();
- matvec_check ally(y, nrows, ncols);
- const double* const yptr2=ally.access();
-
- SEXP output=PROTECT(allocVector(VECSXP, 3));
- try {
- SET_VECTOR_ELT(output, 0, allocMatrix(REALSXP, nrows, ncols));
- double* optr=REAL(VECTOR_ELT(output, 0));
- int tag, lib;
-
- for (tag=0; tag<nrows; ++tag) {
- for (lib=0; lib<ncols; ++lib) {
- optr[lib*nrows] = xptr2[lib] + yptr2[lib];
- }
- ++optr;
- allx.advance();
- ally.advance();
- }
-
- SET_VECTOR_ELT(output, 1, ScalarLogical(allx.is_row_repeated() & ally.is_row_repeated()));
- SET_VECTOR_ELT(output, 2, ScalarLogical(allx.is_col_repeated() & ally.is_col_repeated()));
- } catch (std::exception& e) {
- UNPROTECT(1);
- throw;
- }
-
- UNPROTECT(1);
- return output;
-} catch (std::exception& e) {
- return mkString(e.what());
-}
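
In R terms, the removed function is elementwise addition after expansion, with the repeat flags of the result being the AND of the inputs' flags; an illustrative one-liner using edgeR's expandAsMatrix():

    add.repeat <- function(x, y, dim) expandAsMatrix(x, dim) + expandAsMatrix(y, dim)
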
diff --git a/src/R_ave_log_cpm.cpp b/src/R_ave_log_cpm.cpp
deleted file mode 100644
index 08df056..0000000
--- a/src/R_ave_log_cpm.cpp
+++ /dev/null
@@ -1,62 +0,0 @@
-#include "glm.h"
-#include "add_prior.h"
-#include "matvec_check.h"
-
-SEXP R_ave_log_cpm(SEXP y, SEXP offset, SEXP prior, SEXP disp, SEXP weights, SEXP max_iterations, SEXP tolerance) try {
- count_holder counts(y);
- const int num_tags=counts.get_ntags();
- const int num_libs=counts.get_nlibs();
- double* count_ptr=(double*)R_alloc(num_libs, sizeof(double));
-
- add_prior AP(num_tags, num_libs, prior, offset, true, true);
- const double* const out_prior=AP.get_priors();
- const double* const out_off=AP.get_offsets();
- const bool priors_are_the_same=AP.same_across_rows();
-
- // Other CompressedMatrix stuff.
- matvec_check alld(disp, num_tags, num_libs);
- const double* const dptr2=alld.access();
- matvec_check allw(weights, num_tags, num_libs);
- const double* const wptr2=allw.access();
-
- // GLM fitting specifications
- const int maxit=asInteger(max_iterations);
- const double tol=asReal(tolerance);
-
- SEXP output=PROTECT(allocVector(REALSXP, num_tags));
- try {
- double* optr=REAL(output);
- if (priors_are_the_same) {
- AP.fill_and_next();
- }
-
- int lib;
- for (int tag=0; tag<num_tags; ++tag) {
- counts.fill_and_next(count_ptr);
- if (!priors_are_the_same) {
- AP.fill_and_next();
- }
-
- // Adding the current set of priors.
- for (lib=0; lib<num_libs; ++lib) {
- count_ptr[lib]+=out_prior[lib];
- }
-
- // Fitting a one-way layout.
- std::pair<double,bool> fit=glm_one_group(num_libs, maxit, tol, out_off, wptr2, count_ptr, dptr2, NA_REAL);
- optr[tag]=(fit.first + LNmillion)/LNtwo;
-
- allw.advance();
- alld.advance();
- }
-
- } catch (std::exception& e) {
- UNPROTECT(1);
- throw;
- }
-
- UNPROTECT(1);
- return output;
-} catch (std::exception& e) {
- return mkString(e.what());
-}
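
An R-level sketch of the removed kernel, per the loop above: each tag's average log2-CPM comes from a one-group NB GLM fit on prior-augmented counts. Illustrative only; aveLogCPM() is the user-facing entry point:

    library(edgeR)
    ave.log.cpm <- function(counts, lib.size, prior=2, dispersion=0.05) {
      pr <- prior * lib.size/mean(lib.size)
      fit <- mglmOneGroup(t(t(counts) + pr), offset=log(lib.size + 2*pr),
                          dispersion=dispersion)
      (fit + log(1e6))/log(2)  # same transform as optr[tag] above
    }
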
diff --git a/src/R_calculate_cpm.cpp b/src/R_calculate_cpm.cpp
deleted file mode 100644
index 7d95f4e..0000000
--- a/src/R_calculate_cpm.cpp
+++ /dev/null
@@ -1,87 +0,0 @@
-#include "matvec_check.h"
-#include "utils.h"
-#include "add_prior.h"
-
-/**** Calculating the CPMs in cpm.default with log=TRUE, but more memory-efficiently. *******/
-
-SEXP R_calculate_cpm_log (SEXP y, SEXP libsize, SEXP prior) try {
- count_holder counts(y);
- const int num_tags=counts.get_ntags();
- const int num_libs=counts.get_nlibs();
- double* count_ptr=(double*)R_alloc(num_libs, sizeof(double));
-
- add_prior AP(num_tags, num_libs, prior, libsize, false, true);
- const double* const out_prior=AP.get_priors();
- const double* const out_off=AP.get_offsets();
- const bool priors_are_the_same=AP.same_across_rows();
-
- SEXP output=PROTECT(allocMatrix(REALSXP, num_tags, num_libs));
- try {
- double* outptr=REAL(output);
- if (priors_are_the_same) { // Just doing this once to save time, if they're all the same.
- AP.fill_and_next();
- }
-
- // Adding a library size-adjusted prior to each count.
- int lib;
- for (int tag=0; tag<num_tags; ++tag) {
- counts.fill_and_next(count_ptr);
-
- if (!priors_are_the_same) { // Repeating with the next set of priors/offsets.
- AP.fill_and_next();
- }
-
- for (lib=0; lib<num_libs; ++lib) {
- double& curval=outptr[lib*num_tags];
- curval=count_ptr[lib] + out_prior[lib];
- curval=std::log(curval)-out_off[lib]+LNmillion;
- curval/=LNtwo;
- }
- ++outptr;
- }
- } catch (std::exception& e) {
- UNPROTECT(1);
- throw;
- }
- UNPROTECT(1);
- return output;
-} catch (std::exception& e) {
- return mkString(e.what());
-}
-
-/**** Calculating the CPMs in cpm.default with log=FALSE, but more memory-efficiently. *******/
-
-SEXP R_calculate_cpm_raw (SEXP y, SEXP libsize) try {
- count_holder counts(y);
- const int num_tags=counts.get_ntags();
- const int num_libs=counts.get_nlibs();
- double* count_ptr=(double*)R_alloc(num_libs, sizeof(double));
- matvec_check allL(libsize, num_tags, num_libs);
- const double* const lptr2=allL.access();
-
- SEXP output=PROTECT(allocMatrix(REALSXP, num_tags, num_libs));
- try {
- double* outptr=REAL(output);
- int lib;
- for (int tag=0; tag<num_tags; ++tag) {
- counts.fill_and_next(count_ptr);
-
- for (lib=0; lib<num_libs; ++lib) {
- const double& curlib=lptr2[lib];
- outptr[lib*num_tags]=(curlib==0 ? R_NaN : count_ptr[lib]/(curlib/one_million));
- }
- ++outptr;
- allL.advance();
- }
- } catch (std::exception& e) {
- UNPROTECT(1);
- throw;
- }
- UNPROTECT(1);
- return output;
-} catch (std::exception& e) {
- return mkString(e.what());
-}
-
-
-
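
R-level sketches of the two removed kernels (illustrative; cpm() is the user-facing function, and the zero-library-size guard is omitted):

    cpm.raw <- function(counts, lib.size) t(t(counts)/(lib.size/1e6))
    cpm.log <- function(counts, lib.size, prior=0.25) {
      pr <- prior * lib.size/mean(lib.size)  # scaled prior count
      log2(t((t(counts) + pr)/(lib.size + 2*pr)) * 1e6)
    }
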
diff --git a/src/R_check_counts.cpp b/src/R_check_counts.cpp
deleted file mode 100644
index 7dac4bc..0000000
--- a/src/R_check_counts.cpp
+++ /dev/null
@@ -1,137 +0,0 @@
-#include "matvec_check.h"
-#include "utils.h"
-
-bool isNA(int x) {
- return x==NA_INTEGER;
-}
-
-bool isNA(double x) {
- return !R_FINITE(x);
-}
-
-template <typename T>
-SEXP check_counts (const T* ptr, const int len) {
- int allzero=1;
- for (int i=0; i<len; ++i) {
- const T& curval=ptr[i];
- if (isNA(curval)) {
- throw std::runtime_error("missing values not supported");
- }
- if (curval<0) {
- throw std::runtime_error("negative counts not supported");
- }
- if (curval!=0) {
- allzero=0;
- }
- }
- return ScalarLogical(allzero);
-}
-
-SEXP R_check_counts(SEXP y) try {
- count_holder counts(y);
- const int total_size=counts.get_ntags()*counts.get_nlibs();
- if (counts.is_data_integer()) {
- return check_counts<int>(counts.get_raw_int(), total_size);
- } else {
- return check_counts<double>(counts.get_raw_double(), total_size);
- }
-} catch (std::exception& e) {
- return mkString(e.what());
-}
-
-SEXP R_check_finite (SEXP values, SEXP name) try {
- if (!isReal(values)) { throw std::runtime_error("should be double-precision"); }
- const int nobs=LENGTH(values);
- const double* vptr=REAL(values);
- for (int o=0; o<nobs; ++o) {
- const double& curval=vptr[o];
- if (!R_FINITE(curval)) {
- throw std::runtime_error("should be finite values");
- }
- }
- return ScalarLogical(1);
-} catch (std::exception& e) {
- if (!isString(name) || LENGTH(name)!=1) { throw std::runtime_error("value-type name should be a string"); }
- std::stringstream final;
- final << CHAR(STRING_ELT(name, 0)) << " " << e.what();
- return mkString(final.str().c_str());
-}
-
-SEXP R_check_positive (SEXP values, SEXP name) try {
- if (!isReal(values)) { throw std::runtime_error("should be double-precision"); }
- const int nobs=LENGTH(values);
- const double* vptr=REAL(values);
- for (int o=0; o<nobs; ++o) {
- const double& curval=vptr[o];
- if (!R_FINITE(curval) || curval <= 0) {
- throw std::runtime_error("should be finite positive values");
- }
- }
- return ScalarLogical(1);
-} catch (std::exception& e) {
- if (!isString(name) || LENGTH(name)!=1) { throw std::runtime_error("value-type name should be a string"); }
- std::stringstream final;
- final << CHAR(STRING_ELT(name, 0)) << " " << e.what();
- return mkString(final.str().c_str());
-}
-
-SEXP R_check_nonnegative (SEXP values, SEXP name) try {
- if (!isReal(values)) { throw std::runtime_error("should be double-precision"); }
- const int nobs=LENGTH(values);
- const double* vptr=REAL(values);
- for (int o=0; o<nobs; ++o) {
- const double& curval=vptr[o];
- if (!R_FINITE(curval) || curval < 0) {
- throw std::runtime_error("should be finite non-negative values");
- }
- }
- return ScalarLogical(1);
-} catch (std::exception& e) {
- if (!isString(name) || LENGTH(name)!=1) { throw std::runtime_error("value-type name should be a string"); }
- std::stringstream final;
- final << CHAR(STRING_ELT(name, 0)) << " " << e.what();
- return mkString(final.str().c_str());
-}
-
-template <typename T>
-SEXP check_zero_fitted(const T* yptr, const int num_tags, const int num_libs, SEXP fitted, SEXP tolerance) {
- const int total_len=num_tags*num_libs;
-
- if (!isReal(fitted)) { throw std::runtime_error("fitted values must be double-precision"); }
- if (LENGTH(fitted)!=num_tags*num_libs) { throw std::runtime_error("dimensions of fitted and count matrices are not equal"); }
- const double* fptr=REAL(fitted);
-
- if (!isReal(tolerance) || LENGTH(tolerance)!=1) { throw std::runtime_error("tolerance must be a double-precision scalar"); }
- const double tol=asReal(tolerance);
-
- SEXP output=PROTECT(allocMatrix(LGLSXP, num_tags, num_libs));
- try {
- int* optr=LOGICAL(output);
- for (int i=0; i<total_len; ++i) {
- (*optr)=((*fptr < tol) & (*yptr < tol));
- ++optr;
- ++fptr;
- ++yptr;
- }
- } catch (std::exception& e) {
- UNPROTECT(1);
- throw;
- }
-
- UNPROTECT(1);
- return output;
-}
-
-SEXP R_check_zero_fitted(SEXP y, SEXP fitted, SEXP tolerance) try {
- count_holder counts(y);
- const int num_tags=counts.get_ntags();
- const int num_libs=counts.get_nlibs();
-
- if (counts.is_data_integer()){
- return check_zero_fitted<int>(counts.get_raw_int(), num_tags, num_libs, fitted, tolerance);
- } else {
- return check_zero_fitted<double>(counts.get_raw_double(), num_tags, num_libs, fitted, tolerance);
- }
-} catch (std::exception& e) {
- return mkString(e.what());
-}
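
The validation logic above, condensed to plain R for reference:

    check.counts <- function(y) {
      if (any(is.na(y))) stop("missing values not supported")
      if (any(y < 0)) stop("negative counts not supported")
      all(y == 0)  # TRUE if every count is zero
    }
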
diff --git a/src/R_check_poisson_bound.cpp b/src/R_check_poisson_bound.cpp
deleted file mode 100644
index 11268ed..0000000
--- a/src/R_check_poisson_bound.cpp
+++ /dev/null
@@ -1,50 +0,0 @@
-#include "matvec_check.h"
-
-/* Checks whether the variance is below the Poisson bound. */
-
-SEXP R_check_poisson_bound (SEXP fitted, SEXP disp, SEXP s2) try {
- if (!isReal(fitted)) { throw std::runtime_error("matrix of fitted values should be double-precision"); }
- const double* fptr=REAL(fitted);
-
- SEXP dims=getAttrib(fitted, R_DimSymbol);
- if (!isInteger(dims) || LENGTH(dims)!=2) {
- throw std::runtime_error("matrix dimensions should be an integer vector of length 2");
- }
- const int num_tags=INTEGER(dims)[0], num_libs=INTEGER(dims)[1];
- if (LENGTH(fitted)!=num_libs*num_tags) {
- throw std::runtime_error("recorded matrix dimensions are not consistent with its length");
- }
-
- matvec_check alld(disp, num_tags, num_libs);
- const double* const dptr2=alld.access();
- matvec_check alls(s2, num_tags, num_libs);
- const double* const sptr2=alls.access();
-
- SEXP output=PROTECT(allocVector(LGLSXP, num_tags));
- try {
- int* optr=LOGICAL(output);
- std::fill(optr, optr+num_tags, 0);
-
- int lib;
- for (int tag=0; tag<num_tags; ++tag) {
- for (lib=0; lib<num_libs; ++lib) {
- if ((fptr[lib*num_tags] * dptr2[lib] + 1) * sptr2[lib] < 1) {
- optr[tag]=1;
- break;
- }
- }
-
- ++fptr;
- alld.advance();
- alls.advance();
- }
- } catch (std::exception& e) {
- UNPROTECT(1);
- throw;
- }
-
- UNPROTECT(1);
- return output;
-} catch (std::exception& e){
- return mkString(e.what());
-}
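
An R-level sketch of the bound tested above: a tag is flagged if any library's squeezed variance falls below the Poisson limit. All arguments are taken as full matrices here, unlike the repeat-aware C++:

    below.poisson <- function(fitted, phi, s2) rowSums((fitted*phi + 1)*s2 < 1) > 0
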
diff --git a/src/R_compute_apl.cpp b/src/R_compute_apl.cpp
deleted file mode 100644
index a9936e9..0000000
--- a/src/R_compute_apl.cpp
+++ /dev/null
@@ -1,111 +0,0 @@
-#include "glm.h"
-#include "matvec_check.h"
-
-SEXP R_compute_apl(SEXP y, SEXP means, SEXP disps, SEXP weights, SEXP adjust, SEXP design) try {
- count_holder counts(y);
- const int num_tags=counts.get_ntags();
- const int num_libs=counts.get_nlibs();
- double* count_ptr=(double*)R_alloc(num_libs, sizeof(double));
-
- // Setting up the means.
- if (!isReal(means)) {
- throw std::runtime_error("mean matrix must be double-precision");
- }
- if (LENGTH(means)!=LENGTH(y)) {
- throw std::runtime_error("mean and count matrices must be of same size");
- }
- const double* mptr=REAL(means);
-
- // Setting up the dispersions and weights.
- matvec_check alld(disps, num_tags, num_libs);
- const double* const dptr2=alld.access();
- matvec_check allw(weights, num_tags, num_libs);
- const double* const wptr2=allw.access();
-
- // Determining whether we want to do the adjustment.
- if (!isLogical(adjust) || LENGTH(adjust)!=1) {
- throw std::runtime_error("'adjust' must be a logical scalar");
- }
- const bool do_adjust=asLogical(adjust);
- double* W_ptr=(double*)R_alloc(num_libs, sizeof(double));
-
- // Setting up the design matrix and the CR adjustment object.
- if (!isNumeric(design)) { throw std::runtime_error("design matrix must be double precision"); }
- const int num_coefs=LENGTH(design)/num_libs;
- if (num_coefs*num_libs!=LENGTH(design)) { throw std::runtime_error("dimensions of design matrix not consistent with number of libraries"); }
- adj_coxreid acr(num_libs, num_coefs, REAL(design));
-
- SEXP output=PROTECT(allocVector(REALSXP, num_tags));
- try {
- double* optr=REAL(output);
- double loglik, r, logmur, adj;
- int lib, index;
- for (int tag=0; tag<num_tags; ++tag) {
-
- /* First computing the log-likelihood. */
- double& sum_loglik=(optr[tag]=0);
- counts.fill_and_next(count_ptr);
- index=0;
-
- for (lib=0; lib<num_libs; ++lib, index+=num_tags) {
- const double& cury=count_ptr[lib];
- const double& curmu=mptr[index];
- if (curmu==0) {
- continue; // Should only be possible if count is zero, where the log-likelihood would then be 0.
- }
-
- if (dptr2[lib] > 0) {
- // same as loglik <- rowSums(weights*dnbinom(y,size=1/dispersion,mu=mu,log = TRUE))
- r=1/dptr2[lib];
- logmur=std::log(curmu+r);
- loglik = cury*std::log(curmu) - cury*logmur + r*std::log(r) - r*logmur + lgamma(cury+r) - lgamma(cury+1) - lgamma(r);
- } else {
- // same as loglik <- rowSums(weights*dpois(y,lambda=mu,log = TRUE))
- loglik = cury*std::log(curmu) - curmu - lgamma(cury+1);
- }
- sum_loglik += loglik * wptr2[lib]; // with weights.
- }
-
- if (do_adjust) {
- /* Computing 'W', the matrix of negative binomial probabilities.
- * The class computes 'XtWX' and performs an LDL decomposition
- * to compute the Cox-Reid adjustment factor.
- */
- for (lib=0; lib<num_libs; ++lib) {
- const double& curmu=mptr[lib*num_tags];
- W_ptr[lib] = wptr2[lib] * curmu/(1 + dptr2[lib] * curmu);
- }
-
- if (num_coefs==1) {
- adj=0;
- for (lib=0; lib<num_libs; ++lib) {
- adj+=W_ptr[lib];
- }
- adj=std::log(std::abs(adj))/2;
- } else {
- std::pair<double, bool> x=acr.compute(W_ptr);
- if (!x.second) {
- std::stringstream errout;
- errout << "LDL factorization failed for tag " << tag+1;
- throw std::runtime_error(errout.str());
- }
- adj=x.first;
- }
- sum_loglik-=adj;
- }
-
- ++mptr;
- alld.advance();
- allw.advance();
- }
- } catch (std::exception& e) {
- UNPROTECT(1);
- throw;
- }
-
- UNPROTECT(1);
- return output;
-} catch (std::exception& e) {
- return mkString(e.what());
-}
-
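
An R-level sketch of the per-tag quantity above, following the code's own comments: the weighted NB log-likelihood minus the Cox-Reid adjustment. Single tag, dispersion > 0; names are illustrative:

    apl <- function(y, mu, phi, w, X) {
      ll <- sum(w * dnbinom(y, size=1/phi, mu=mu, log=TRUE))
      W <- w * mu/(1 + phi*mu)  # working weights
      ll - 0.5 * as.numeric(determinant(crossprod(X, W * X))$modulus)
    }
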
diff --git a/src/R_compute_nbdev.cpp b/src/R_compute_nbdev.cpp
index 0f54c37..dd6551f 100644
--- a/src/R_compute_nbdev.cpp
+++ b/src/R_compute_nbdev.cpp
@@ -1,85 +1,40 @@
#include "utils.h"
#include "glm.h"
-#include "matvec_check.h"
-SEXP R_compute_nbdev (SEXP y, SEXP mu, SEXP phi, SEXP weights, SEXP dosum) try {
- count_holder counts(y);
- const int num_tags=counts.get_ntags();
- const int num_libs=counts.get_nlibs();
- double* count_ptr=(double*)R_alloc(num_libs, sizeof(double));
-
- // Setting up means.
- if (!isReal(mu)) { throw std::runtime_error("matrix of means should be double-precision"); }
- if (LENGTH(mu)!=num_tags*num_libs) { throw std::runtime_error("length of means is not consistent with length of counts"); }
- const double* mptr=REAL(mu);
-
- // Setting up dispersions.
- matvec_check alld(phi, num_tags, num_libs);
- const double* const dptr2=alld.access();
-
- // Seeing if we have to sum things together.
- if (!isLogical(dosum) || LENGTH(dosum)!=1) {
- throw std::runtime_error("summation specification should be a logical vector");
- }
- const bool sumtogether=asLogical(dosum);
- int tag, lib, index;
-
- if (sumtogether) {
- // Setting up weights.
- matvec_check allw(weights, num_tags, num_libs);
- const double* const wptr2=allw.access();
-
- SEXP output=PROTECT(allocVector(REALSXP, num_tags));
- try {
- // Running through each row and computing the unit deviance, and then computing the weighted sum.
- double* optr=REAL(output);
- for (tag=0; tag<num_tags; ++tag) {
- counts.fill_and_next(count_ptr);
-
- double& current_sumdev=(optr[tag]=0);
- index=0;
- for (lib=0; lib<num_libs; ++lib) {
- current_sumdev += compute_unit_nb_deviance(count_ptr[lib], mptr[index], dptr2[lib]) * wptr2[lib];
- index+=num_tags;
- }
-
- ++mptr;
- alld.advance();
- allw.advance();
- }
- } catch (std::exception& e) {
- UNPROTECT(1);
- throw;
- }
-
- UNPROTECT(1);
- return output;
- } else {
- SEXP output=PROTECT(allocMatrix(REALSXP, num_tags, num_libs));
- try {
- // Computing unit deviance for each observation.
- double* optr=REAL(output);
- for (tag=0; tag<num_tags; ++tag) {
- counts.fill_and_next(count_ptr);
-
- index=0;
- for (lib=0; lib<num_libs; ++lib) {
- optr[index] = compute_unit_nb_deviance(count_ptr[lib], mptr[index], dptr2[lib]);
- index += num_tags;
- }
-
- ++optr;
- ++mptr;
- alld.advance();
- }
- } catch (std::exception& e) {
- UNPROTECT(1);
- throw;
- }
-
- UNPROTECT(1);
- return output;
- }
+SEXP R_compute_nbdev (SEXP y, SEXP mu, SEXP phi) try {
+ if (!isNumeric(phi)) { throw std::runtime_error("dispersion vector should be double-precision"); }
+ const int ntags=LENGTH(phi);
+ if (!isNumeric(y)) { throw std::runtime_error("count matrix should be double-precision"); }
+ if (!isNumeric(mu)) { throw std::runtime_error("matrix of means should be double-precision"); }
+ const int nlib=LENGTH(mu)/ntags;
+ if (nlib*ntags !=LENGTH(mu)) { throw std::runtime_error("mean matrix has inconsistent dimensions"); }
+ if (LENGTH(mu)!=LENGTH(y)) { throw std::runtime_error("count and mean matrices should have same dimensions"); }
+
+ const double* yptr=REAL(y);
+ const double* mptr=REAL(mu);
+ const double* dptr=REAL(phi);
+
+	// Running through each row and computing the unit deviance for each observation.
+ SEXP output=PROTECT(allocMatrix(REALSXP, ntags, nlib));
+ try {
+ double* optr=REAL(output);
+ int counter;
+ for (int i=0; i<ntags; ++i) {
+ counter=0;
+ for (int j=0; j<nlib; ++j, counter+=ntags) {
+ optr[counter]=compute_unit_nb_deviance(yptr[counter], mptr[counter], dptr[i]);
+ }
+ ++optr;
+ ++yptr;
+ ++mptr;
+ }
+ } catch (std::exception& e) {
+ UNPROTECT(1);
+ throw;
+ }
+
+ UNPROTECT(1);
+ return output;
} catch(std::exception& e) {
return mkString(e.what());
}
diff --git a/src/R_cr_adjust.cpp b/src/R_cr_adjust.cpp
new file mode 100644
index 0000000..4ed4a81
--- /dev/null
+++ b/src/R_cr_adjust.cpp
@@ -0,0 +1,43 @@
+#include "glm.h"
+
+/* 'w' represents a matrix of negative binomial probabilities (i.e.
+ * the prob of success/failure, a function of mean and dispersion)
+ * whereas 'x' represents the design matrix. This function calculates
+ * the multiplication of the matrices, then performs an LDL factorization
+ * to get the lower triangular factor 'L'. The diagonal entries can then
+ * be used to compute the Cox-Reid adjustment factor.
+ */
+SEXP R_cr_adjust (SEXP w, SEXP x, SEXP nlibs) try {
+ if (!isNumeric(w)) { throw std::runtime_error("matrix of likelihoods must be double precision"); }
+ if (!isNumeric(x)) { throw std::runtime_error("design matrix must be double precision"); }
+
+ const int num_libs=asInteger(nlibs);
+ const int num_tags=LENGTH(w)/num_libs;
+ const int num_coefs=LENGTH(x)/num_libs;
+
+ // Setting up a couple of indices for quick access.
+ adj_coxreid acr(num_libs, num_coefs, REAL(x));
+ const double* wptr=REAL(w);
+
+ SEXP output=PROTECT(allocVector(REALSXP, num_tags));
+ double* out_ptr=REAL(output);
+ try {
+ // Running through each tag.
+ for (int tag=0; tag<num_tags; ++tag) {
+ std::pair<double, bool> x=acr.compute(wptr);
+ if (!x.second) {
+ std::stringstream errout;
+ errout << "LDL factorization failed for tag " << tag+1;
+ throw std::runtime_error(errout.str());
+ }
+ out_ptr[tag]=x.first;
+ wptr+=num_libs;
+ }
+ } catch (std::exception& e) {
+ UNPROTECT(1);
+ throw;
+ }
+
+ UNPROTECT(1);
+ return output;
+} catch (std::exception& e) { return mkString(e.what()); }
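
Per tag, the adjustment computed above is half the log-determinant of the weighted information matrix; an illustrative R equivalent:

    cr.adjust <- function(w, X) 0.5 * as.numeric(determinant(crossprod(X, w * X))$modulus)
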
diff --git a/src/R_get_one_way_fitted.cpp b/src/R_get_one_way_fitted.cpp
deleted file mode 100644
index e528de7..0000000
--- a/src/R_get_one_way_fitted.cpp
+++ /dev/null
@@ -1,60 +0,0 @@
-#include "glm.h"
-#include "matvec_check.h"
-
-/*** Function to compute the fitted values without a lot of temporary matrices. ***/
-
-SEXP R_get_one_way_fitted (SEXP beta, SEXP offset, SEXP groups) try {
- SEXP dims=getAttrib(beta, R_DimSymbol);
- if (!isInteger(dims) || LENGTH(dims)!=2) {
- throw std::runtime_error("matrix dimensions should be an integer vector of length 2");
- }
- int num_tags=INTEGER(dims)[0];
- int num_coefs=INTEGER(dims)[1];
-
- if (!isReal(beta)) {
- throw std::runtime_error("beta matrix should be double-precision");
- }
- if (LENGTH(beta)!=num_tags*num_coefs) {
- throw std::runtime_error("recorded matrix dimensions are not consistent with its length");
- }
- const double* bptr=REAL(beta);
- double* bptr2=(double*)R_alloc(num_coefs, sizeof(double));
-
- if (!isInteger(groups)) {
- throw std::runtime_error("grouping vector should be integer");
- }
- int num_libs=LENGTH(groups);
- int* gptr=INTEGER(groups);
-
- matvec_check allo(offset, num_tags, num_libs);
- const double* optr2=allo.access();
-
- SEXP output=PROTECT(allocMatrix(REALSXP, num_tags, num_libs));
- try {
- double* outptr=REAL(output);
- int lib, coef;
- for (int tag=0; tag<num_tags; ++tag) {
- // Storing to a single vector for faster caching.
- for (coef=0; coef<num_coefs; ++coef) {
- bptr2[coef]=bptr[coef*num_tags];
- }
- ++bptr;
-
- // Caching is going to be suboptimal, but oh well.
- for (lib=0; lib<num_libs; ++lib) {
- outptr[lib*num_tags]=std::exp(optr2[lib] + bptr2[gptr[lib]]);
- }
- ++outptr;
- allo.advance();
- }
- } catch (std::exception& e) {
- UNPROTECT(1);
- throw;
- }
-
- UNPROTECT(1);
- return output;
-} catch (std::exception& e) {
- return mkString(e.what());
-}
-
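
An R-level sketch of the removed kernel: one-way fitted values are exp(offset + the group's coefficient), with 1-based group indices here (the C++ uses 0-based):

    one.way.fitted <- function(beta, offset, groups)
      exp(offset + beta[, groups, drop=FALSE])  # beta: tags x coefs; offset: tags x libs
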
diff --git a/src/R_initialize_levenberg.cpp b/src/R_initialize_levenberg.cpp
deleted file mode 100644
index 2bd4a01..0000000
--- a/src/R_initialize_levenberg.cpp
+++ /dev/null
@@ -1,209 +0,0 @@
-#include "glm.h"
-#include "matvec_check.h"
-
-/* Different initialization methods for the Levenberg coefficients */
-
-const char side='L';
-const char trans_ormqr='T';
-const char uplo='U';
-const char trans_trtrs='N';
-const char diag='N';
-const int unity=1;
-
-struct QRdecomposition {
- QRdecomposition(const double* curX, const int nrows, const int ncoefs) : X(curX), NR(nrows), NC(ncoefs) {
- Xcopy=(double*)R_alloc(NR*NC, sizeof(double));
- tau=(double*)R_alloc(NC, sizeof(double));
-
- // Also setting up a vector for weights and effects.
- weights=(double*)R_alloc(NR, sizeof(double));
- effects=(double*)R_alloc(NR, sizeof(double));
-
- // Setting up the workspace for dgeqrf.
- double tmpwork;
- lwork_geqrf=lwork_ormqr=-1;
- F77_CALL(dgeqrf)(&NR, &NC, Xcopy, &NR, tau, &tmpwork, &lwork_geqrf, &info);
-
- // Loading up the optimal WORK.
- lwork_geqrf=tmpwork+0.5;
- if (lwork_geqrf < 1) { lwork_geqrf = 1; }
- work_geqrf=(double*)R_alloc(lwork_geqrf, sizeof(double));
-
- // Repeating for dormqr
- F77_CALL(dormqr)(&side, &trans_ormqr, &NR, &unity, &NC, Xcopy, &NR, tau, effects, &NR, &tmpwork, &lwork_ormqr, &info);
- lwork_ormqr=tmpwork+0.5;
- if (lwork_ormqr < 1) { lwork_ormqr = 1; }
- work_ormqr=(double*)R_alloc(lwork_ormqr, sizeof(double));
-
- return;
- }
-
- void store_weights(const double* w) {
- if (w==NULL) {
- std::fill(weights, weights+NR, 1);
- } else {
- for (row=0; row<NR; ++row) {
- weights[row]=std::sqrt(w[row]);
- }
- }
- return;
- }
-
- void decompose() {
- std::copy(X, X+NR*NC, Xcopy);
- index=0;
- for (coef=0; coef<NC; ++coef) {
- for (row=0; row<NR; ++row) {
- Xcopy[index]*=weights[row];
- ++index;
- }
- }
-
- F77_CALL(dgeqrf)(&NR, &NC, Xcopy, &NR, tau, work_geqrf, &lwork_geqrf, &info);
- if (info) {
- throw std::runtime_error("QR decomposition failed");
- }
- return;
- }
-
- void solve(const double * y) {
- for (row=0; row<NR; ++row) {
- effects[row]=y[row]*weights[row];
- }
-
- F77_CALL(dormqr)(&side, &trans_ormqr, &NR, &unity, &NC, Xcopy, &NR, tau, effects, &NR, work_ormqr, &lwork_ormqr, &info);
- if (info) {
- throw std::runtime_error("Q**T multiplication failed");
- }
-
- F77_CALL(dtrtrs)(&uplo, &trans_trtrs, &diag, &NC, &unity, Xcopy, &NR, effects, &NR, &info);
- if (info) {
- throw std::runtime_error("failed to solve the triangular system");
- }
-
- return;
- }
-
- const double* X;
- double * Xcopy, * tau, * effects, *weights, *work_geqrf, *work_ormqr;
- const int NR, NC;
- int lwork_geqrf, lwork_ormqr, info;
- int row, coef, index;
-};
-
-
-SEXP R_get_levenberg_start(SEXP y, SEXP offset, SEXP disp, SEXP weights, SEXP design, SEXP use_null) try {
- if (!isReal(design)) { throw std::runtime_error("design matrix should be double precision"); }
- count_holder counts(y);
- const int num_tags=counts.get_ntags();
- const int num_libs=counts.get_nlibs();
- double* count_ptr=(double*)R_alloc(num_libs, sizeof(double));
-
- // Getting and checking the dimensions of the arguments.
- const int dlen=LENGTH(design);
- if (dlen%num_libs!=0) { throw std::runtime_error("size of design matrix is incompatible with number of libraries"); }
- const int num_coefs=dlen/num_libs;
-
- // Initializing pointers to the assorted features.
- const double *design_ptr=REAL(design);
- matvec_check allo(offset, num_tags, num_libs);
- const double* const optr2=allo.access();
- matvec_check allw(weights, num_tags, num_libs);
- const double* const wptr2=allw.access();
- matvec_check alld(disp, num_tags, num_libs);
- const double* const dptr2=alld.access();
-
- // Determining what type of algorithm is to be used.
- if (!isLogical(use_null) || LENGTH(use_null)!=1) {
- throw std::runtime_error("'use_null' specification should be a logical scalar");
- }
- const bool null_method=asLogical(use_null);
-
- SEXP output_beta=PROTECT(allocMatrix(REALSXP, num_tags, num_coefs));
- try {
- double* obptr=REAL(output_beta);
- QRdecomposition QR(design_ptr, num_libs, num_coefs);
- double* tmp_exprs=(double*)R_alloc(num_libs, sizeof(double));
- int lib, coef;
-
- if (null_method) {
- QR.store_weights(NULL);
- QR.decompose();
- double sum_exprs=0, sum_weight=0, curN, curweight;
-
- for (int tag=0; tag<num_tags; ++tag) {
- counts.fill_and_next(count_ptr);
-
- // Computing weighted average of the count:library size ratios.
- sum_weight=0;
- sum_exprs=0;
- for (lib=0; lib<num_libs; ++lib) {
- curN=std::exp(optr2[lib]);
- curweight=wptr2[lib]*curN/(1 + dptr2[lib] * curN);
- sum_exprs += count_ptr[lib] * curweight / curN;
- sum_weight += curweight;
- }
- std::fill(tmp_exprs, tmp_exprs + num_libs, std::log(sum_exprs/sum_weight));
-
- // Performing the QR decomposition and taking the solution.
- QR.solve(tmp_exprs);
- for (coef=0; coef<num_coefs; ++coef) {
- obptr[coef*num_tags]=QR.effects[coef];
- }
-
- ++obptr;
- allo.advance();
- allw.advance();
- alld.advance();
- }
- } else {
- bool weights_are_the_same=allw.is_row_repeated();
- if (weights_are_the_same) {
- QR.store_weights(wptr2);
- QR.decompose();
- }
-
- // Finding the delta.
- double delta;
- if (counts.is_data_integer()) {
- const int* ciptr=counts.get_raw_int();
- delta=*std::max_element(ciptr, ciptr+num_libs*num_tags);
- } else {
- const double* cdptr=counts.get_raw_double();
- delta=*std::max_element(cdptr, cdptr+num_libs*num_tags);
- }
- delta=std::min(delta, 1.0/6);
-
- for (int tag=0; tag<num_tags; ++tag) {
- if (!weights_are_the_same) {
- QR.store_weights(wptr2);
- QR.decompose();
- allw.advance();
- }
- counts.fill_and_next(count_ptr);
-
- // Computing normalized log-expression values.
- for (lib=0; lib<num_libs; ++lib) {
- tmp_exprs[lib]=std::log(std::max(delta, count_ptr[lib])) - optr2[lib];
- }
-
- // Performing the QR decomposition and taking the solution.
- QR.solve(tmp_exprs);
- for (coef=0; coef<num_coefs; ++coef) {
- obptr[coef*num_tags]=QR.effects[coef];
- }
-
- ++obptr;
- allo.advance();
- }
- }
- } catch (std::exception& e){
- UNPROTECT(1);
- throw;
- }
-
- UNPROTECT(1);
- return output_beta;
-} catch (std::exception& e) {
- return mkString(e.what());
-}
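
An R-level sketch of the non-null initialization above, for a single tag: weighted least squares of clamped log-counts (minus offsets) on the design matrix. Names are illustrative, and delta stands in for the clamp computed from the counts in the C++:

    start.beta <- function(y, design, offset, w=1, delta=1/6) {
      z <- log(pmax(y, delta)) - offset
      qr.coef(qr(design * sqrt(w)), z * sqrt(w))
    }
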
diff --git a/src/R_levenberg.cpp b/src/R_levenberg.cpp
index e98173e..5aa701b 100644
--- a/src/R_levenberg.cpp
+++ b/src/R_levenberg.cpp
@@ -1,48 +1,56 @@
#include "glm.h"
#include "matvec_check.h"
-SEXP R_levenberg (SEXP design, SEXP y, SEXP disp, SEXP offset, SEXP weights,
- SEXP beta, SEXP tol, SEXP maxit) try {
- count_holder counts(y);
- const int num_tags=counts.get_ntags();
- const int num_libs=counts.get_nlibs();
+SEXP R_levenberg (SEXP nlib, SEXP ntag, SEXP design, SEXP counts, SEXP disp, SEXP offset, SEXP weights,
+ SEXP beta, SEXP fitted, SEXP tol, SEXP maxit) try {
+ if (!isNumeric(design)) { throw std::runtime_error("design matrix should be double precision"); }
+ if (!isNumeric(disp)) { throw std::runtime_error("dispersion matrix should be double precision"); }
+ if (!isNumeric(beta)) { throw std::runtime_error("matrix of start values for coefficients should be double precision"); }
+ if (!isNumeric(fitted)) { throw std::runtime_error("matrix of starting fitted values should be double precision"); }
+ const int num_tags=asInteger(ntag);
+ const int num_libs=asInteger(nlib);
+
+ // Checking the count matrix.
+ const double *cdptr=NULL;
+ const int* ciptr=NULL;
double* count_ptr=(double*)R_alloc(num_libs, sizeof(double));
+ bool is_integer=isInteger(counts);
+ if (is_integer) {
+ ciptr=INTEGER(counts);
+ } else {
+ if (!isNumeric(counts)) { throw std::runtime_error("count matrix must be integer or double-precision"); }
+ cdptr=REAL(counts);
+ }
// Getting and checking the dimensions of the arguments.
- if (!isReal(design)) { throw std::runtime_error("design matrix should be double precision"); }
- SEXP des_dims=getAttrib(design, R_DimSymbol);
- if (!isInteger(des_dims) || LENGTH(des_dims)!=2) {
- throw std::runtime_error("design matrix dimensions should be an integer vector of length 2");
- }
- if (INTEGER(des_dims)[0]!=num_libs) {
- throw std::runtime_error("number of rows of design matrix should be equal to number of libraries");
- }
- const int num_coefs=INTEGER(des_dims)[1];
-
-	if (!isReal(beta)) { throw std::runtime_error("starting coefficient values must be double precision"); }
- SEXP bdims=getAttrib(beta, R_DimSymbol);
- if (!isInteger(bdims) || LENGTH(bdims)!=2) {
- throw std::runtime_error("beta matrix dimensions should be an integer vector of length 2");
- }
- if (INTEGER(bdims)[0]!=num_tags) {
- throw std::runtime_error("number of rows of beta matrix should be equal to number of genes");
- } else if (INTEGER(bdims)[1]!=num_coefs) {
- throw std::runtime_error("number of columns of beta matrix should be equal to number of coefficients");
- }
+ const int dlen=LENGTH(design);
+ const int clen=LENGTH(counts);
+ if (dlen%num_libs!=0) { throw std::runtime_error("size of design matrix is incompatible with number of libraries"); }
+ const int num_coefs=dlen/num_libs;
+ if (clen!=num_tags*num_libs) {
+ throw std::runtime_error("dimensions of the count matrix are not as specified");
+ } else if (LENGTH(beta)!=num_tags*num_coefs) {
+ throw std::runtime_error("dimensions of the beta matrix do not match to the number of tags and coefficients");
+ } else if (LENGTH(fitted)!=clen) {
+ throw std::runtime_error("dimensions of the fitted matrix do not match those of the count matrix");
+ } else if (LENGTH(disp)!=clen) {
+ throw std::runtime_error("dimensions of dispersion matrix is not as specified");
+ }
// Initializing pointers to the assorted features.
- const double *design_ptr=REAL(design), *bptr=REAL(beta);
- matvec_check allo(offset, num_tags, num_libs);
- const double* const optr2=allo.access();
- matvec_check allw(weights, num_tags, num_libs);
- const double* const wptr2=allw.access();
- matvec_check alld(disp, num_tags, num_libs);
- const double* const dptr2=alld.access();
+ const double* beta_ptr=REAL(beta),
+ *design_ptr=REAL(design),
+ *fitted_ptr=REAL(fitted),
+ *disp_ptr=REAL(disp);
+ matvec_check allo(num_libs, num_tags, offset, true, "offset");
+ const double* const* optr2=allo.access();
+ matvec_check allw(num_libs, num_tags, weights, true, "weight", 1);
+ const double* const* wptr2=allw.access();
// Initializing output cages.
SEXP output=PROTECT(allocVector(VECSXP, 5));
- SET_VECTOR_ELT(output, 0, allocMatrix(REALSXP, num_tags, num_coefs)); // beta
- SET_VECTOR_ELT(output, 1, allocMatrix(REALSXP, num_tags, num_libs)); // new fitted
+ SET_VECTOR_ELT(output, 0, allocMatrix(REALSXP, num_coefs, num_tags)); // beta (transposed)
+ SET_VECTOR_ELT(output, 1, allocMatrix(REALSXP, num_libs, num_tags)); // new fitted (transposed)
SET_VECTOR_ELT(output, 2, allocVector(REALSXP, num_tags));
SET_VECTOR_ELT(output, 3, allocVector(INTSXP, num_tags));
SET_VECTOR_ELT(output, 4, allocVector(LGLSXP, num_tags));
@@ -51,41 +59,41 @@ SEXP R_levenberg (SEXP design, SEXP y, SEXP disp, SEXP offset, SEXP weights,
double* dev_ptr=REAL(VECTOR_ELT(output, 2));
int* iter_ptr=INTEGER(VECTOR_ELT(output, 3));
int* fail_ptr=LOGICAL(VECTOR_ELT(output, 4));
-
- double* tmp_beta_ptr=(double*)R_alloc(num_coefs, sizeof(double));
- double* tmp_fitted_ptr=(double*)R_alloc(num_libs, sizeof(double));
-
try {
// Running through each tag and fitting the NB GLM.
glm_levenberg glbg(num_libs, num_coefs, design_ptr, asInteger(maxit), asReal(tol));
- int lib, coef;
for (int tag=0; tag<num_tags; ++tag) {
- counts.fill_and_next(count_ptr);
-
- // Copying elements to the tmp_beta as these are modified in-place.
- for (coef=0; coef<num_coefs; ++coef) {
- tmp_beta_ptr[coef]=bptr[coef*num_tags];
- }
- ++bptr;
- if (glbg.fit(optr2, count_ptr, wptr2, dptr2, tmp_fitted_ptr, tmp_beta_ptr)) {
- std::stringstream errout;
+ // Copying integer/double counts to a new vector.
+ if (is_integer) {
+ for (int i=0; i<num_libs; ++i) { count_ptr[i]=double(ciptr[i]); }
+ ciptr+=num_libs;
+ } else {
+ for (int i=0; i<num_libs; ++i) { count_ptr[i]=cdptr[i]; }
+ cdptr+=num_libs;
+ }
+
+ // Copying elements into new_beta and new_fitted, so the output is stored automatically.
+ for (int i=0; i<num_libs; ++i) { new_fitted_ptr[i]=fitted_ptr[i]; }
+ for (int i=0; i<num_coefs; ++i) { new_beta_ptr[i]=beta_ptr[i]; }
+ if (glbg.fit(*optr2, count_ptr,
+#ifdef WEIGHTED
+ *wptr2,
+#endif
+ disp_ptr, new_fitted_ptr, new_beta_ptr)) {
+ std::stringstream errout;
errout<< "solution using Cholesky decomposition failed for tag " << tag+1;
throw std::runtime_error(errout.str());
}
allo.advance();
allw.advance();
- alld.advance();
- for (lib=0; lib<num_libs; ++lib) {
- new_fitted_ptr[lib*num_tags]=tmp_fitted_ptr[lib];
- }
- ++new_fitted_ptr;
- for (coef=0; coef<num_coefs; ++coef) {
- new_beta_ptr[coef*num_tags]=tmp_beta_ptr[coef];
- }
- ++new_beta_ptr;
-
+ disp_ptr+=num_libs;
+ fitted_ptr+=num_libs;
+ new_fitted_ptr+=num_libs;
+ beta_ptr+=num_coefs;
+ new_beta_ptr+=num_coefs;
+
*(dev_ptr++)=glbg.get_deviance();
*(iter_ptr++)=glbg.get_iterations();
*(fail_ptr++)=glbg.is_failure();
@@ -98,4 +106,3 @@ SEXP R_levenberg (SEXP design, SEXP y, SEXP disp, SEXP offset, SEXP weights,
UNPROTECT(1);
return output;
} catch (std::exception& e) { return mkString(e.what()); }
-
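
A note on the layout change above: 3.18.1 wrote each tag's coefficients into a num_tags x num_coefs matrix, so consecutive coefficients sat num_tags apart, whereas 3.14.0 allocates the transpose so each tag's block is contiguous and the output pointer simply advances. A minimal standalone sketch of the two access patterns (not edgeR code):

    #include <vector>
    #include <cstddef>

    int main() {
        const std::size_t num_tags = 3, num_coefs = 2;
        std::vector<double> a(num_tags * num_coefs), b(num_coefs * num_tags);

        // 3.18.1-style: num_tags x num_coefs, column-major => stride num_tags.
        double* aptr = a.data();
        for (std::size_t tag = 0; tag < num_tags; ++tag, ++aptr) {
            for (std::size_t coef = 0; coef < num_coefs; ++coef) {
                aptr[coef * num_tags] = 1.0;   // strided write per coefficient
            }
        }

        // 3.14.0-style: num_coefs x num_tags, column-major => contiguous per tag.
        double* bptr = b.data();
        for (std::size_t tag = 0; tag < num_tags; ++tag, bptr += num_coefs) {
            for (std::size_t coef = 0; coef < num_coefs; ++coef) {
                bptr[coef] = 1.0;              // contiguous write per tag
            }
        }
        return 0;
    }

The transposed outputs are presumably turned back around on the R side after the .Call returns.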
diff --git a/src/R_one_group.cpp b/src/R_one_group.cpp
index 81b5b44..83d36cd 100644
--- a/src/R_one_group.cpp
+++ b/src/R_one_group.cpp
@@ -1,108 +1,78 @@
#include "glm.h"
#include "matvec_check.h"
-template <typename T>
-bool is_array_equal_to(const T* x, const int n, const bool rep, const T& v) {
- if (rep) {
- return (n>0 && x[0]==v);
- } else {
- for (int i=0; i<n; ++i) {
- if (x[i]!=v) { return false; }
- }
- return true;
- }
-}
-
-SEXP R_one_group (SEXP y, SEXP disp, SEXP offsets, SEXP weights, SEXP max_iterations, SEXP tolerance, SEXP beta) try {
- count_holder counts(y);
- const int num_tags=counts.get_ntags();
- const int num_libs=counts.get_nlibs();
- double* yptr=(double*)R_alloc(num_libs, sizeof(double));
-
- // Setting up assorted input matrices.
- matvec_check allo(offsets, num_tags, num_libs);
- const double* const optr2=allo.access();
- matvec_check allw(weights, num_tags, num_libs);
- const double* const wptr2=allw.access();
- matvec_check alld(disp, num_tags, num_libs);
- const double* const dptr2=alld.access();
- matvec_check allb(beta, num_tags, 1); // only one coefficient.
- const double* const bptr2=allb.access();
-
- // GLM iterations.
+SEXP R_one_group (SEXP nlib, SEXP ntag, SEXP y, SEXP disp, SEXP offsets, SEXP weights, SEXP max_iterations, SEXP tolerance, SEXP beta) try {
+ const int num_tags=asInteger(ntag);
+ const int num_libs=asInteger(nlib);
+ if (!isNumeric(disp)) { throw std::runtime_error("dispersion matrix must be double precision"); }
+ if (num_tags*num_libs != LENGTH(disp)) { throw std::runtime_error("dimensions of the dispersion matrix are not as specified"); }
+ if (num_tags*num_libs != LENGTH(y)) { throw std::runtime_error("dimensions of the count table are not as specified"); } // Checking that the length matches the specified dimensions.
+
+ if (!isNumeric(beta)) { throw std::runtime_error("beta start vector must be double precision"); }
+ const int blen=LENGTH(beta);
+ const bool use_old_start=(blen!=0);
+ if (use_old_start && blen!=num_tags) {
+ throw std::runtime_error("non-empty start vector must have length equal to the number of tags");
+ }
+ const double* bsptr=REAL(beta);
+
const int maxit=asInteger(max_iterations);
const double tol=asReal(tolerance);
+
+ // Setting up some iterators. We provide some flexibility in detecting numeric-ness.
+ double *ydptr=NULL;
+ int* yiptr=NULL;
+ double* yptr=(double*)R_alloc(num_libs, sizeof(double));
+ bool is_integer=isInteger(y);
+ if (is_integer) {
+ yiptr=INTEGER(y);
+ } else {
+ if (!isNumeric(y)) { throw std::runtime_error("count matrix must be integer or double-precision"); }
+ ydptr=REAL(y);
+ }
+ matvec_check allo(num_libs, num_tags, offsets, false, "offset");
+ const double* const* optr2=allo.access();
+ matvec_check allw(num_libs, num_tags, weights, false, "weight", 1);
+ const double* const* wptr2=allw.access();
+ const double* dptr=REAL(disp);
- // Setting up beta for output.
+ // Setting up beta for output.
SEXP output=PROTECT(allocVector(VECSXP, 2));
SET_VECTOR_ELT(output, 0, allocVector(REALSXP, num_tags));
SET_VECTOR_ELT(output, 1, allocVector(LGLSXP, num_tags));
double* bptr=REAL(VECTOR_ELT(output, 0));
int* cptr=LOGICAL(VECTOR_ELT(output, 1));
try {
-
- // Preparing for possible Poisson sums.
- bool disp_is_zero=true, weight_is_one=true;
- double sum_counts, sum_lib=0;
- int lib;
- if (allo.is_row_repeated()) {
- for (lib=0; lib<num_libs; ++lib) {
- sum_lib+=std::exp(optr2[lib]);
- }
- }
- if (alld.is_row_repeated()) {
- disp_is_zero=is_array_equal_to<double>(dptr2, num_libs, alld.is_col_repeated(), 0);
- }
- if (allw.is_row_repeated()) {
- weight_is_one=is_array_equal_to<double>(wptr2, num_libs, allw.is_col_repeated(), 1);
- }
-
+
// Iterating through tags and fitting.
+ int lib=0;
for (int tag=0; tag<num_tags; ++tag) {
- counts.fill_and_next(yptr);
-
- // Checking for the Poisson special case with all-unity weights and all-zero dispersions.
- if (!alld.is_row_repeated()) {
- disp_is_zero=is_array_equal_to<double>(dptr2, num_libs, alld.is_col_repeated(), 0);
- }
- if (!allw.is_row_repeated()) {
- weight_is_one=is_array_equal_to<double>(wptr2, num_libs, allw.is_col_repeated(), 1);
- }
-
- if (disp_is_zero && weight_is_one) {
- if (!allo.is_row_repeated()) {
- // Only recalculate sum of library sizes if it has changed.
- sum_lib=0;
- for (lib=0; lib<num_libs; ++lib) { sum_lib+=std::exp(optr2[lib]); }
- }
-
- sum_counts=0;
- for (lib=0; lib<num_libs; ++lib) { sum_counts+=yptr[lib]; }
- if (sum_counts==0) {
- bptr[tag]=R_NegInf;
- } else {
- bptr[tag]=std::log(sum_counts/sum_lib);
- }
- cptr[tag]=true;
- } else {
- // Otherwise going through NR iterations.
- std::pair<double, bool> out=glm_one_group(num_libs, maxit, tol, optr2, wptr2, yptr, dptr2, *bptr2);
- bptr[tag]=out.first;
- cptr[tag]=out.second;
- }
+ if (is_integer) {
+ for (lib=0; lib<num_libs; ++lib) { yptr[lib]=double(yiptr[lib]); }
+ yiptr+=num_libs;
+ } else {
+ yptr=ydptr;
+ ydptr+=num_libs;
+ }
+ std::pair<double, bool> out=glm_one_group(num_libs, maxit, tol, *optr2,
+#ifdef WEIGHTED
+ *wptr2,
+#endif
+ yptr, dptr, (use_old_start ? bsptr[tag] : R_NaReal));
+ bptr[tag]=out.first;
+ cptr[tag]=out.second;
+ dptr+=num_libs;
allo.advance();
allw.advance();
- alld.advance();
- allb.advance();
}
- } catch (std::exception& e) {
+ } catch (std::exception& e) {
UNPROTECT(1);
- throw;
+ throw;
}
// Returning everything as a list.
- UNPROTECT(1);
+ UNPROTECT(1);
return output;
} catch (std::exception& e) {
return mkString(e.what());
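
The deleted fast path above exploits a closed form: with all weights equal to one and all dispersions zero, the one-group NB model reduces to Poisson, whose maximum likelihood solution per tag g is

    \hat{\beta}_g = \log \frac{\sum_{\ell} y_{g\ell}}{\sum_{\ell} e^{o_{g\ell}}}

with \hat{\beta}_g = -\infty when every count is zero. The 3.14.0 code instead always runs the Newton-Raphson iterations in glm_one_group.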
diff --git a/src/add_prior.cpp b/src/add_prior.cpp
deleted file mode 100644
index d986061..0000000
--- a/src/add_prior.cpp
+++ /dev/null
@@ -1,54 +0,0 @@
-#include "add_prior.h"
-
-add_prior::add_prior(int nt, int nl, SEXP priors, SEXP offsets, bool login, bool logout) : num_tags(nt), num_libs(nl),
- allp(priors, num_tags, num_libs), allo(offsets, num_tags, num_libs), pptr2(allp.access()), optr2(allo.access()),
- logged_in(login), logged_out(logout), tagdex(0) {
- adj_prior=(double*)R_alloc(num_libs, sizeof(double));
- adj_libs=(double*)R_alloc(num_libs, sizeof(double));
- return;
-}
-
-void add_prior::fill_and_next() {
- if (same_across_rows() && tagdex!=0) {
- // Skipping if all rows are the same, and we've already filled it in once.
- return;
- }
-
- ave_lib=0;
- for (lib=0; lib<num_libs; ++lib) {
- if (logged_in) { // unlogging to get library sizes, if they were originally logged offsets.
- adj_libs[lib]=std::exp(optr2[lib]);
- } else {
- adj_libs[lib]=optr2[lib];
- }
- ave_lib+=adj_libs[lib];
- }
- ave_lib/=num_libs;
-
- // Computing the adjusted prior count for each library.
- for (lib=0; lib<num_libs; ++lib) {
- adj_prior[lib]=pptr2[lib]*adj_libs[lib]/ave_lib;
- }
-
- // Adding it twice back to the library size, and log-transforming.
- for (lib=0; lib<num_libs; ++lib) {
- double& curval=adj_libs[lib];
- curval+=2*adj_prior[lib];
- if (logged_out) {
- curval=std::log(curval);
- }
- }
-
- ++tagdex;
- allp.advance();
- allo.advance();
- return;
-}
-
-const double* const add_prior::get_priors() const { return adj_prior; }
-
-const double* const add_prior::get_offsets() const { return adj_libs; }
-
-const bool add_prior::same_across_rows() const {
- return (allp.is_row_repeated() && allo.is_row_repeated());
-}
diff --git a/src/add_prior.h b/src/add_prior.h
deleted file mode 100644
index def47d9..0000000
--- a/src/add_prior.h
+++ /dev/null
@@ -1,25 +0,0 @@
-#ifndef ADD_PRIOR_H
-#define ADD_PRIOR_H
-#include "matvec_check.h"
-#include "utils.h"
-
-class add_prior{
-public:
- add_prior(int, int, SEXP, SEXP, bool, bool);
- void fill_and_next();
- const double* const get_priors() const;
- const double* const get_offsets() const;
- const bool same_across_rows() const;
-private:
- double* adj_prior, *adj_libs;
- const int num_tags, num_libs;
-
- matvec_check allp, allo;
- const double* const pptr2, *const optr2;
- const bool logged_in, logged_out;
-
- int lib, tagdex;
- double ave_lib;
-};
-
-#endif
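
For reference, the deleted add_prior class implements the prior-count adjustment: each library's prior is scaled by its relative library size, and twice the prior is added back to the library size before logging. In formulas, with L_l the unlogged library size (L_l = exp(o_l) for logged offsets), p_l the supplied prior, and L-bar the mean library size:

    p_{\ell}' = p_{\ell} \cdot \frac{L_{\ell}}{\bar{L}}, \qquad
    o_{\ell}' = \log\bigl( L_{\ell} + 2\, p_{\ell}' \bigr)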
diff --git a/src/adj_coxreid.cpp b/src/adj_coxreid.cpp
index b5da08c..13b2fdd 100644
--- a/src/adj_coxreid.cpp
+++ b/src/adj_coxreid.cpp
@@ -13,14 +13,13 @@ adj_coxreid::adj_coxreid (const int& nl, const int& nc, const double* d) : ncoef
* reallocate the work pointer to this value.
*/
double temp_work;
- F77_CALL(dsytrf)(&uplo, &ncoefs, working_matrix, &ncoefs, pivots, &temp_work, &lwork, &info);
+ F77_NAME(dsytrf)(&uplo, &ncoefs, working_matrix, &ncoefs, pivots, &temp_work, &lwork, &info);
if (info) {
delete [] pivots;
delete [] working_matrix;
throw std::runtime_error("failed to identify optimal size of workspace through ILAENV");
}
lwork=int(temp_work+0.5);
- if (lwork < 1) { lwork = 1; }
work=new double [lwork];
// Saving a local copy of the design pointer.
@@ -38,7 +37,12 @@ adj_coxreid::~adj_coxreid () {
}
std::pair<double, bool> adj_coxreid::compute(const double* wptr) {
- // Setting working weight_matrix to 'A=Xt %*% diag(W) %*% X' with column-major storage.
+ /* Setting working weight_matrix to 'A=Xt %*% diag(W) %*% X' with column-major storage.
+ * This represents the expected Fisher information. The overall strategy is
+ * to compute the determinant of the information matrix in order to compute
+ * the adjustment factor for the likelihood (in order to account for uncertainty
+ * in the nuisance parameters i.e. the fitted values).
+ */
for (int row=0; row<ncoefs; ++row) {
for (int col=0; col<=row; ++col) {
double& cur_entry=(working_matrix[col*ncoefs+row]=0);
@@ -48,11 +52,25 @@ std::pair<double, bool> adj_coxreid::compute(const double* wptr) {
}
}
- // LDL* decomposition.
- F77_CALL(dsytrf)(&uplo, &ncoefs, working_matrix, &ncoefs, pivots, work, &lwork, &info);
+ /* We now apply the Cholesky decomposition using the appropriate routine from the
+ * LAPACK library. Specifically, we call the routine to do a symmetric indefinite
+ * factorisation i.e. A = LDLt. This guarantees factorization for singular matrices
+ * when the actual Cholesky decomposition would fail.
+ */
+ F77_NAME(dsytrf)(&uplo, &ncoefs, working_matrix, &ncoefs, pivots, work, &lwork, &info);
if (info<0) { return std::make_pair(0, false); }
- // Log-determinant as sum of the log-diagonals, then halving (see below).
+ /* For triangular matrices, we need the diagonal to compute the determinant. Fortunately,
+ * this remains the diagonal of the output matrix despite the permutations performed by
+ * the pivoting. We sum over all log'd diagonal elements to get the log determinant of the
+ * information matrix (valid because det(LDL*)=det(L)*det(D)*det(L*) and we're using the
+ * result of the decomposition). We then have to halve the resulting value.
+ *
+ * Note the protection against zero. This just replaces it with an appropriately small
+ * non-zero value, if the diagonal element is zero or NA. This is valid because the set
+ * of fitted values which are zero will be constant at all dispersions. Thus, any replacement
+ * value will eventually cancel out during interpolation to obtain the CRAPLE.
+ */
double sum_log_diagonals=0;
for (int i=0; i<ncoefs; ++i) {
const double& cur_val=working_matrix[i*ncoefs+i];
@@ -65,37 +83,4 @@ std::pair<double, bool> adj_coxreid::compute(const double* wptr) {
return std::make_pair(sum_log_diagonals*0.5, true);
}
-/* EXPLANATION:
-
- XtWX represents the expected Fisher information. The overall strategy is to compute the
- log-determinant of this matrix, to compute the adjustment factor for the likelihood (in
- order to account for uncertainty in the nuisance parameters i.e. the fitted values).
-
- We want to apply the Cholesky decomposition to the XtWX matrix. However, to be safe,
- we call the routine to do a symmetric indefinite factorisation i.e. A = LDLt. This
- guarantees factorization for singular matrices when the actual Cholesky decomposition
- would fail because it would start whining about non-positive eigenvalues.
-
- We then try to compute the determinant of the working_matrix. Here we use two facts:
-
- - For triangular matrices, the determinant is the product of the diagonals.
- - det(LDL*)=det(L)*det(D)*det(L*)
- - All diagonal elements of 'L' are unity.
-
- Thus, all we need to do is sum over all log'd diagonal elements in 'D', which -
- happily enough - are stored as the diagonal elements of 'working_matrix'. (And then
- divide by two, because that's just how the Cox-Reid adjustment works.)
-
- 'info > 0' indicates that one of the diagonals is zero. We handle this by replacing the
- it with an appropriately small non-zero value, if the diagnoal element is zero or NA. This
- is valid because the set of fitted values which are zero will be constant at all dispersions.
- Thus, any replacement value will eventually cancel out during interpolation to obtain the CRAPLE.
-
- Note that the LAPACK routine will also do some pivoting, essentially solving PAP* = LDL* for
- some permutation matrix P. This shouldn't affect anything; the determinant of the permutation
- is either 1 or -1, but this cancels out, so det(A) = det(PAP*).
-
- Further note that the routine can theoretically give block diagonals, but this should
- not occur for positive (semi)definite matrices, which is what XtWX should always be.
-*/
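
For reference, the adjustment that adj_coxreid::compute returns is half the log-determinant of the Fisher information, obtained from the LDL^T factorisation above. In formula form (a sketch of the math already described in the comments, with D the diagonal factor from dsytrf and P the pivoting permutation):

    \mathrm{adj} = \tfrac{1}{2} \log\det\bigl( X^T W X \bigr), \qquad
    \log\det\bigl( X^T W X \bigr) = \log\det\bigl( P L D L^T P^T \bigr) = \sum_{i} \log D_{ii}

since every diagonal entry of L is one and \det(P) = \pm 1 cancels with \det(P^T).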
diff --git a/src/fmm_spline.c b/src/fmm_spline.c
index 318ce89..294e047 100644
--- a/src/fmm_spline.c
+++ b/src/fmm_spline.c
@@ -20,16 +20,13 @@
/* Comment from A. Lun:
* This is copied straight from 'splines.c' in the 'stats' package of
- * R's core installation (current as of r72293), with a few modifications:
- *
- * - commented out all functions except for fmm_spline.
- *
- * - commented out the error call in fmm_spline when fewer than 2 points
- * are supplied, as that is unnecessary for my code (and fails to compile
- * without a definition of EDOM).
- *
- * - const'ifed the 'x' and 'y' pointers to protect them from modification.
- */
+ * R's core installation. I have removed all functions except for
+ * fmm_spline. I have commented out the error call for the instance where
+ * fewer than 2 points are supplied, as that is unnecessary for my code
+ * (and fails to compile without a definition of EDOM). I've also const'ifed
+ * the 'x' and 'y' pointers to protect them from modification. Otherwise
+ * the function and comments have not been modified.
+ */
/* Spline Interpolation
* --------------------
diff --git a/src/glm.h b/src/glm.h
index 075a99a..2bc6cd8 100644
--- a/src/glm.h
+++ b/src/glm.h
@@ -5,13 +5,20 @@
#include "utils.h"
std::pair<double,bool> glm_one_group(const int&, const int&, const double&, const double*,
- const double*, const double*, const double*, double);
+#ifdef WEIGHTED
+ const double*,
+#endif
+ const double*, const double*, double);
class glm_levenberg {
public:
glm_levenberg(const int&, const int&, const double*, const int&, const double&);
~glm_levenberg();
- int fit(const double*, const double*, const double*, const double*, double*, double*);
+ int fit(const double*, const double*,
+#ifdef WEIGHTED
+ const double*,
+#endif
+ const double*, double*, double*);
const bool& is_failure() const;
const int& get_iterations() const;
@@ -35,7 +42,11 @@ private:
int iter;
bool failed;
- double nb_deviance(const double*, const double*, const double*, const double*) const;
+ double nb_deviance(const double*, const double*,
+#ifdef WEIGHTED
+ const double*,
+#endif
+ const double*) const;
void autofill(const double*, double*, const double*);
};
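
The #ifdef WEIGHTED blocks above add or drop the weights argument at compile time. A minimal self-contained sketch of the same idiom (hypothetical function names, not part of edgeR):

    #include <cstdio>

    // Toggled at compile time: building with -DWEIGHTED adds a weights argument.
    double total_count(const int n, const double* y
    #ifdef WEIGHTED
                       , const double* w
    #endif
                       ) {
        double total = 0;
        for (int i = 0; i < n; ++i) {
    #ifdef WEIGHTED
            total += w[i] * y[i];   // weighted accumulation
    #else
            total += y[i];          // unweighted accumulation
    #endif
        }
        return total;
    }

    int main() {
        const double y[3] = {1, 2, 3};
    #ifdef WEIGHTED
        const double w[3] = {1, 1, 2};
        std::printf("%.1f\n", total_count(3, y, w));
    #else
        std::printf("%.1f\n", total_count(3, y));
    #endif
        return 0;
    }

Every translation unit must agree on whether WEIGHTED is defined, or the signatures declared in glm.h will not match their definitions.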
diff --git a/src/glm_levenberg.cpp b/src/glm_levenberg.cpp
index e7079c2..af11c37 100644
--- a/src/glm_levenberg.cpp
+++ b/src/glm_levenberg.cpp
@@ -1,24 +1,36 @@
#include "glm.h"
+
+/* Function to calculate the deviance. Note the protection for very large mu*phi (where we
+ * use a gamma instead) or very small mu*phi (where we use the Poisson instead). This
+ * approximation protects against numerical instability introduced by subtracting
+ * one very large logarithm (log cur_mu) from another (log (cur_mu+1/phi)).
+ * We need to consider the 'phi' as the approximation is only good when the product is
+ * very big or very small.
+ */
const double one_millionth=std::pow(10, -6.0);
const double supremely_low_value=std::pow(10, -13.0), ridiculously_low_value=std::pow(10, -100.0);
-double glm_levenberg::nb_deviance (const double* y, const double* mu, const double* w, const double* phi) const {
+double glm_levenberg::nb_deviance (const double* y, const double* mu,
+#ifdef WEIGHTED
+ const double* w,
+#endif
+ const double* phi) const {
double tempdev=0;
for (int i=0; i<nlibs; ++i) {
+#ifdef WEIGHTED
tempdev+=w[i]*compute_unit_nb_deviance(y[i], mu[i], phi[i]);
+#else
+ tempdev+=compute_unit_nb_deviance(y[i], mu[i], phi[i]);
+#endif
}
return tempdev;
}
-const char trans='n';
-const int incx=1, incy=1;
-const double first_scaling=1, second_scaling=1;
void glm_levenberg::autofill(const double* offset, double* mu, const double* beta) {
- std::copy(offset, offset+nlibs, mu);
- F77_CALL(dgemv)(&trans, &nlibs, &ncoefs, &first_scaling, design, &nlibs, beta, &incx, &second_scaling, mu, &incy);
for (int lib=0; lib<nlibs; ++lib) {
- double& cur_mean=mu[lib];
+ double& cur_mean=(mu[lib]=offset[lib]);
+ for (int coef=0; coef<ncoefs; ++coef) { cur_mean+=design[coef*nlibs+lib]*beta[coef]; }
cur_mean=std::exp(cur_mean);
}
return;
@@ -61,9 +73,12 @@ const char normal='n', transposed='t', uplo='U';
const double a=1, b=0;
const int nrhs=1;
-int glm_levenberg::fit(const double* offset, const double* y, const double* w,
+int glm_levenberg::fit(const double* offset, const double* y,
+#ifdef WEIGHTED
+ const double* w,
+#endif
const double* disp, double* mu, double* beta) {
- // We expect 'beta' to be supplied. We then check the maximum value of the counts.
+ // We expect 'mu' and 'beta' to be supplied. We then check the maximum value of the counts.
double ymax=0;
for (int lib=0; lib<nlibs; ++lib) {
const double& count=y[lib];
@@ -75,28 +90,24 @@ int glm_levenberg::fit(const double* offset, const double* y, const double* w,
// If we start off with all entries at zero, there's really no point continuing.
if (ymax<low_value) {
- std::fill(beta, beta+ncoefs, NA_REAL);
- std::fill(mu, mu+nlibs, 0);
+ for (int coef=0; coef<ncoefs; ++coef) { beta[coef]=NA_REAL; }
+ for (int lib=0; lib<nlibs; ++lib) { mu[lib]=0; }
return 0;
}
- // Otherwise, we compute 'mu' based on 'beta'. Returning if there are no coefficients!
+ /* Otherwise, we have to make sure 'beta' and 'mu' make sense relative to one another.
+ * We then proceed to iterating using reweighted least squares.
+ */
autofill(offset, mu, beta);
- dev=nb_deviance(y, mu, w, disp);
- if (ncoefs==0) {
- return 0;
- }
-
- // Iterating using reweighted least squares; setting up assorted temporary objects.
+ dev=nb_deviance(y, mu,
+#ifdef WEIGHTED
+ w,
+#endif
+ disp);
double max_info=-1, lambda=0;
- double denom, weight, deriv;
- int row, col, index;
- double divergence;
- int lev=0;
- bool low_dev=false;
while ((++iter) <= maxit) {
- std::fill(dl, dl+ncoefs, 0);
+ for (int i=0; i<ncoefs; ++i) { dl[i]=0; }
/* Here we set up the matrix XtWX i.e. the Fisher information matrix. X is the design matrix and W is a diagonal matrix
* with the working weights for each observation (i.e. library). The working weights are part of the first derivative of
@@ -111,23 +122,26 @@ int glm_levenberg::fit(const double* offset, const double* y, const double* w,
* is the second derivative, and dl is the first, you can see that we are effectively performing a multivariate
* Newton-Raphson procedure with 'dbeta' as the step.
*/
- for (row=0; row<nlibs; ++row) {
+ for (int row=0; row<nlibs; ++row) {
const double& cur_mu=mu[row];
- denom=(1+cur_mu*disp[row]);
- weight=cur_mu/denom*w[row];
- deriv=(y[row]-cur_mu)/denom*w[row];
-
- index=row;
- for (col=0; col<ncoefs; ++col) {
+ const double denom=(1+cur_mu*disp[row]);
+#ifdef WEIGHTED
+ const double weight=cur_mu/denom*w[row];
+ const double deriv=(y[row]-cur_mu)/denom*w[row];
+#else
+ const double weight=cur_mu/denom;
+ const double deriv=(y[row]-cur_mu)/denom;
+#endif
+ for (int col=0; col<ncoefs; ++col){
+ const int index=col*nlibs+row;
wx[index]=design[index]*weight;
dl[col]+=design[index]*deriv;
- index+=nlibs;
}
}
- F77_CALL(dgemm)(&transposed, &normal, &ncoefs, &ncoefs, &nlibs,
+ F77_NAME(dgemm)(&transposed, &normal, &ncoefs, &ncoefs, &nlibs,
&a, design, &nlibs, wx, &nlibs, &b, xwx, &ncoefs);
- for (row=0; row<ncoefs; ++row) {
- const double& cur_val=xwx[row*ncoefs+row];
+ for (int i=0; i<ncoefs; ++i) {
+ const double& cur_val=xwx[i*ncoefs+i];
if (cur_val>max_info) { max_info=cur_val; }
}
if (iter==1) {
@@ -139,28 +153,29 @@ int glm_levenberg::fit(const double* offset, const double* y, const double* w,
 * step can be found that decreases the deviance. In short, decreases in the deviance
 * are enforced to avoid problems with convergence.
*/
- lev=0;
- low_dev=false;
+ int lev=0;
+ bool low_dev=false;
while (++lev) {
- std::copy(dl, dl+ncoefs, dbeta);
-
+ for (int col=0; col<ncoefs; ++col) { dbeta[col]=dl[col]; } // Copying dl to dbeta.
do {
/* We need to set up copies as the decomposition routine overwrites the originals, and
* we want the originals in case we don't like the latest step. For efficiency, we only
* refer to the upper triangular for the XtWX copy (as it should be symmetrical). We also add
* 'lambda' to the diagonals. This reduces the step size as the second derivative is increased.
*/
- for (col=0; col<ncoefs; ++col) {
- index=col*ncoefs;
- std::copy(xwx+index, xwx+index+col+1, xwx_copy+index);
- xwx_copy[index+col]+=lambda;
+ for (int col=0; col<ncoefs; ++col) {
+ for (int row=0; row<=col; ++row) {
+ const int index=col*ncoefs+row;
+ xwx_copy[index]=xwx[index];
+ if (row==col) { xwx_copy[index]+=lambda; }
+ }
}
// Cholesky decomposition, and then use of the decomposition to solve for dbeta in (XtWX)dbeta = dl.
- F77_CALL(dpotrf)(&uplo, &ncoefs, xwx_copy, &ncoefs, &info);
+ F77_NAME(dpotrf)(&uplo, &ncoefs, xwx_copy, &ncoefs, &info);
if (info!=0) {
/* If it fails, it MUST mean that the matrix is singular due to numerical imprecision
- * as all the diagonal entries of the XtWX matrix must be positive. This occurs because of
+ * as all the diagonal entries of the XVX matrix must be positive. This occurs because of
* fitted values being exactly zero; thus, the coefficients attempt to converge to negative
* infinity. This generally forces the step size to be larger (i.e. lambda lower) in order to
* get to infinity faster (which is impossible). Low lambda leads to numerical instability
@@ -174,11 +189,11 @@ int glm_levenberg::fit(const double* offset, const double* y, const double* w,
} else { break; }
} while (1);
- F77_CALL(dpotrs)(&uplo, &ncoefs, &nrhs, xwx_copy, &ncoefs, dbeta, &ncoefs, &info);
+ F77_NAME(dpotrs)(&uplo, &ncoefs, &nrhs, xwx_copy, &ncoefs, dbeta, &ncoefs, &info);
if (info!=0) { return 1; }
// Updating beta and the means. 'dbeta' stores 'Y' from the solution of (X*VX)Y=dl, corresponding to a NR step.
- for (index=0; index<ncoefs; ++index) { beta_new[index]=beta[index]+dbeta[index]; }
+ for (int i=0; i<ncoefs; ++i) { beta_new[i]=beta[i]+dbeta[i]; }
autofill(offset, mu_new, beta_new);
/* Checking if the deviance has decreased or if it's too small to care about. Either case is good
@@ -187,11 +202,15 @@ int glm_levenberg::fit(const double* offset, const double* y, const double* w,
* lambda up so we want to retake the step from where we were before). This is why we don't modify the values
* in-place until we're sure we want to take the step.
*/
- const double dev_new=nb_deviance(y, mu_new, w, disp);
+ const double dev_new=nb_deviance(y, mu_new,
+#ifdef WEIGHTED
+ w,
+#endif
+ disp);
if (dev_new/ymax < supremely_low_value) { low_dev=true; }
if (dev_new <= dev || low_dev) {
- std::copy(beta_new, beta_new+ncoefs, beta);
- std::copy(mu_new, mu_new+nlibs, mu);
+ for (int i=0; i<ncoefs; ++i) { beta[i]=beta_new[i]; }
+ for (int i=0; i<nlibs; ++i) { mu[i]=mu_new[i]; }
dev=dev_new;
break;
}
@@ -213,8 +232,8 @@ int glm_levenberg::fit(const double* offset, const double* y, const double* w,
*/
if (failed) { break; }
if (low_dev) { break; }
- divergence=0;
- for (index=0; index<ncoefs; ++index) { divergence+=dl[index]*dbeta[index]; }
+ double divergence=0;
+ for (int i=0; i<ncoefs; ++i) { divergence+=dl[i]*dbeta[i]; }
if (divergence < tolerance) { break; }
/* If we quit the inner levenberg loop immediately and survived all the break conditions above, that means that deviance is decreasing
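
Summarising the iteration above in formula form (W and the score vector follow directly from the working weight and deriv computed in the loop; w_l = 1 in the unweighted build):

    \bigl( X^T W X + \lambda I \bigr)\, \delta\beta = X^T v, \qquad
    W_{\ell\ell} = \frac{w_{\ell}\, \mu_{\ell}}{1 + \mu_{\ell} \phi_{\ell}}, \qquad
    v_{\ell} = \frac{w_{\ell} \bigl( y_{\ell} - \mu_{\ell} \bigr)}{1 + \mu_{\ell} \phi_{\ell}}

The step \beta \leftarrow \beta + \delta\beta is accepted only when the deviance does not increase; otherwise \lambda is inflated and the step retried, which is the Levenberg damping described in the comments.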
diff --git a/src/glm_one_group.cpp b/src/glm_one_group.cpp
index 91e6477..8bb6643 100644
--- a/src/glm_one_group.cpp
+++ b/src/glm_one_group.cpp
@@ -1,7 +1,10 @@
#include "glm.h"
std::pair<double,bool> glm_one_group(const int& nlibs, const int& maxit, const double& tolerance, const double* offset,
- const double* weights, const double* y, const double* disp, double cur_beta) {
+#ifdef WEIGHTED
+ const double* weights,
+#endif
+ const double* y, const double* disp, double cur_beta) {
/* Setting up initial values for beta as the log of the mean of the ratio of counts to offsets.
* This is the exact solution for the gamma distribution (which is the limit of the NB as
 * the dispersion goes to infinity). However, if cur_beta is not NA, then we assume it's good.
@@ -9,14 +12,24 @@ std::pair<double,bool> glm_one_group(const int& nlibs, const int& maxit, const d
bool nonzero=false;
if (ISNA(cur_beta)) {
cur_beta=0;
+#ifdef WEIGHTED
double totweight=0;
+#else
+ double totweight=nlibs;
+#endif
for (int j=0; j<nlibs; ++j) {
const double& cur_val=y[j];
if (cur_val>low_value) {
+#ifdef WEIGHTED
cur_beta+=cur_val/std::exp(offset[j]) * weights[j];
+#else
+ cur_beta+=cur_val/std::exp(offset[j]);
+#endif
nonzero=true;
}
+#ifdef WEIGHTED
totweight+=weights[j];
+#endif
}
cur_beta=std::log(cur_beta/totweight);
} else {
@@ -36,8 +49,13 @@ std::pair<double,bool> glm_one_group(const int& nlibs, const int& maxit, const d
info=0;
for (int j=0; j<nlibs; ++j) {
const double mu=std::exp(cur_beta+offset[j]), denominator=1+mu*disp[j];
+#ifdef WEIGHTED
dl+=(y[j]-mu)/denominator * weights[j];
info+=mu/denominator * weights[j];
+#else
+ dl+=(y[j]-mu)/denominator;
+ info+=mu/denominator;
+#endif
}
const double step=dl/info;
cur_beta+=step;
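
The Newton-Raphson update coded above is, per tag (with w_j = 1 in the unweighted build):

    \beta \leftarrow \beta + \frac{\sum_{j} w_j (y_j - \mu_j) / (1 + \mu_j \phi_j)}
                                  {\sum_{j} w_j\, \mu_j / (1 + \mu_j \phi_j)}, \qquad
    \mu_j = e^{\beta + o_j}

initialised, when no start value is supplied, at \beta_0 = \log\bigl( \sum_j w_j y_j e^{-o_j} / \sum_j w_j \bigr), the gamma-limit solution mentioned in the comment.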
diff --git a/src/init.cpp b/src/init.cpp
index 7f7ae78..94f6c9a 100644
--- a/src/init.cpp
+++ b/src/init.cpp
@@ -5,30 +5,14 @@
extern "C" {
static const R_CallMethodDef all_call_entries[] = {
- CALLDEF(R_compute_nbdev, 5),
- CALLDEF(R_compute_apl, 6),
+ CALLDEF(R_compute_nbdev, 3),
+ CALLDEF(R_cr_adjust, 3),
CALLDEF(R_exact_test_by_deviance, 5),
+ CALLDEF(R_levenberg, 11),
CALLDEF(R_loess_by_col, 4),
CALLDEF(R_maximize_interpolant, 2),
-
- CALLDEF(R_levenberg, 8),
- CALLDEF(R_get_levenberg_start, 6),
- CALLDEF(R_one_group, 7),
- CALLDEF(R_get_one_way_fitted, 3),
+ CALLDEF(R_one_group, 9),
CALLDEF(R_simple_good_turing, 3),
-
- CALLDEF(R_add_prior_count, 3),
- CALLDEF(R_calculate_cpm_log, 3),
- CALLDEF(R_calculate_cpm_raw, 2),
- CALLDEF(R_ave_log_cpm, 7),
-
- CALLDEF(R_check_counts, 1),
- CALLDEF(R_check_finite, 2),
- CALLDEF(R_check_positive, 2),
- CALLDEF(R_check_nonnegative, 2),
- CALLDEF(R_check_zero_fitted, 3),
- CALLDEF(R_check_poisson_bound, 3),
- CALLDEF(R_add_repeat_matrices, 4),
{NULL, NULL, 0}
};
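
For context, the CALLDEF entries above are name/arity pairs handed to R's routine registration mechanism. A minimal sketch of how such a table is typically wired up; the CALLDEF definition and the init symbol here are illustrative assumptions, not quoted from the edgeR sources:

    #include <Rinternals.h>
    #include <R_ext/Rdynload.h>

    // Assumed form of the macro: routine name plus argument count.
    #define CALLDEF(name, n) {#name, (DL_FUNC) &name, n}

    extern "C" SEXP R_simple_good_turing(SEXP, SEXP, SEXP); // defined elsewhere

    static const R_CallMethodDef call_entries[] = {
        CALLDEF(R_simple_good_turing, 3),
        {NULL, NULL, 0}
    };

    extern "C" void R_init_edgeR(DllInfo* info) {
        // Register the .Call entry points and disable dynamic symbol lookup.
        R_registerRoutines(info, NULL, call_entries, NULL, NULL);
        R_useDynamicSymbols(info, FALSE);
    }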
diff --git a/src/matvec_check.cpp b/src/matvec_check.cpp
index 0ab9435..59a6593 100644
--- a/src/matvec_check.cpp
+++ b/src/matvec_check.cpp
@@ -1,142 +1,64 @@
#include "matvec_check.h"
-matvec_check::matvec_check(SEXP incoming, int nr, int nc) : nrow(nr), ncol(nc), mycheck(NULL), index(0), libdex(0), temp(NULL), toget(NULL) {
-
- SEXP dims=getAttrib(incoming, R_DimSymbol);
- if (!isInteger(dims) || LENGTH(dims)!=2) {
- throw std::runtime_error("matrix dimensions should be an integer vector of length 2");
- }
- const int NR=INTEGER(dims)[0], NC=INTEGER(dims)[1];
- if (LENGTH(incoming)!=NC*NR) {
- throw std::runtime_error("recorded matrix dimensions are not consistent with its length");
- }
-
- SEXP rep_r=getAttrib(incoming, install("repeat.row"));
- if (!isLogical(rep_r) || LENGTH(rep_r)!=1) {
- throw std::runtime_error("repeat_row specification must be a logical scalar");
- }
- repeat_row=asLogical(rep_r);
- if (!repeat_row) {
- if (NR!=nrow){
- throw std::runtime_error("matrix dimensions are not consistent for non-repeating number of rows");
- }
- } else if (NR!=1) {
- throw std::runtime_error("only one row should be present if it is repeating");
- }
- maxdex=NR;
-
- SEXP rep_c=getAttrib(incoming, install("repeat.col"));
- if (!isLogical(rep_c) || LENGTH(rep_c)!=1) {
- throw std::runtime_error("repeat_col specification must be a logical scalar");
- }
- repeat_col=asLogical(rep_c);
- if (!repeat_col) {
- if (NC!=ncol){
- throw std::runtime_error("matrix dimensions are not consistent for non-repeating number of columns");
- }
- } else if (NC!=1) {
- throw std::runtime_error("only one column should be present if it is repeating");
- }
-
- if (!isReal(incoming)) {
- throw std::runtime_error("matrix should be double-precision");
- }
+matvec_check::matvec_check(const int nlib, const int nlen, SEXP incoming, const bool transposed,
+ const char* err, const double nullfill) : mycheck(NULL), temp(NULL), isvec(true), istran(transposed),
+ nl(nlib), nt(nlen), tagdex(0), libdex(0) {
+
+ std::stringstream failed;
+ if (!isNumeric(incoming)) {
+ failed << err << " vector or matrix should be double precision";
+ throw std::runtime_error(failed.str());
+ }
+
+ // Checking if it is a vector, matrix or transposed matrix.
mycheck=REAL(incoming);
- temp=new double[ncol];
-
- try {
- // Setting the initial value of 'temp'.
- if (repeat_row) {
- if (repeat_col) {
- std::fill(temp, temp+ncol, *mycheck);
- } else {
- temp=new double[ncol];
- std::copy(mycheck, mycheck+ncol, temp);
- }
- } else {
- advance();
- }
- } catch (std::exception& e) {
- delete [] temp;
- throw;
- }
-
+ const int curlen=LENGTH(incoming);
+ if (curlen==0) {
+ temp=new double[nl];
+ for (int i=0; i<nl; ++i) { temp[i]=nullfill; }
+ mycheck=temp;
+ } else if (curlen!=nl) {
+ isvec=false;
+ if (LENGTH(incoming)!=nl*nlen) {
+ failed << "dimensions of " << err << " matrix are not consistent with number of libraries and tags";
+ throw std::runtime_error(failed.str());
+ }
+ if (!istran) {
+ temp=new double[nl];
+ libdex=0;
+ for (int i=0; i<nl; ++i, libdex+=nt) { temp[i]=mycheck[libdex]; }
+ }
+ } else {
+ // Otherwise, it's all good; we can use the pointer directly if it's a vector.
+ ;
+ }
return;
}
void matvec_check::advance() {
- if (repeat_row || index>=maxdex) {
- // No need for updating, if repeated across rows; or if we're at the end.
- return;
- }
- if (repeat_col) {
- // Updating for new row.
- std::fill(temp, temp+ncol, mycheck[index]);
- } else {
- // Updating for new row/col.
- toget=mycheck + index;
- for (libdex=0; libdex<ncol; ++libdex) {
- temp[libdex]=*toget;
- toget+=nrow;
- }
- }
- ++index;
- return;
-}
-
-const double* const matvec_check::access() const {
- return temp;
-}
-
-const bool matvec_check::is_row_repeated () const {
- return repeat_row;
+ if (!isvec) {
+ if (!istran) {
+ // Copying elements to an array if it is not transposed, so each row can be accessed through a single pointer.
+ ++mycheck;
+ if ((++tagdex) >= nt) { return; }
+ libdex=0;
+ for (int i=0; i<nl; ++i, libdex+=nt) { temp[i]=mycheck[libdex]; }
+ } else {
+ // Each (original) row is a (transposed) column, so rows can be accessed directly in column-major format.
+ mycheck+=nl;
+ }
+ }
+ return;
}
-const bool matvec_check::is_col_repeated () const {
- return repeat_col;
+const double* const* matvec_check::access() const {
+ if (isvec || istran) {
+ return &mycheck;
+ } else {
+ return &temp;
+ }
}
matvec_check::~matvec_check() {
if (temp!=NULL) { delete [] temp; }
}
-
-count_holder::count_holder (SEXP y) : yiptr(NULL), ydptr(NULL) {
- SEXP dims=getAttrib(y, R_DimSymbol);
- if (!isInteger(dims) || LENGTH(dims)!=2) {
- throw std::runtime_error("matrix dimensions should be an integer vector of length 2");
- }
- num_tags=INTEGER(dims)[0];
- num_libs=INTEGER(dims)[1];
-
- is_integer=isInteger(y);
- if (is_integer) {
- yiptr=INTEGER(y);
- } else if (isReal(y)) {
- ydptr=REAL(y);
- } else {
- throw std::runtime_error("count matrix must be integer or double-precision");
- }
-
- if (LENGTH(y)!=num_tags*num_libs) {
- throw std::runtime_error("dimensions of the count matrix are not as specified");
- }
- return;
-}
-
-void count_holder::fill_and_next(double* yptr) {
- if (is_integer) {
- for (libdex=0; libdex<num_libs; ++libdex) { yptr[libdex]=double(yiptr[libdex*num_tags]); }
- ++yiptr;
- } else {
- for (libdex=0; libdex<num_libs; ++libdex) { yptr[libdex]=ydptr[libdex*num_tags]; }
- ++ydptr;
- }
- return;
-}
-
-bool count_holder::is_data_integer() const { return is_integer; }
-const int* count_holder::get_raw_int () const { return yiptr; }
-const double* count_holder::get_raw_double () const { return ydptr; }
-
-int count_holder::get_ntags() const { return num_tags; }
-int count_holder::get_nlibs() const { return num_libs; }
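
The new access() returns a pointer-to-pointer rather than a bare pointer: callers such as R_levenberg and R_one_group above dereference it once per tag, and advance() retargets the underlying row without the caller re-fetching anything. A standalone sketch of the idiom (not edgeR code):

    #include <cstdio>

    // Handing out a pointer-to-pointer lets the producer retarget the row
    // while the consumer keeps reading through the same view.
    struct row_cursor {
        const double* current;
        int ncols;
        void advance() { current += ncols; }
        const double* const* access() const { return &current; }
    };

    int main() {
        const double data[6] = {1, 2, 3, 4, 5, 6}; // 3 rows of 2, row-contiguous
        row_cursor cur{data, 2};
        const double* const* view = cur.access();
        for (int row = 0; row < 3; ++row, cur.advance()) {
            std::printf("row %d: %.0f %.0f\n", row, (*view)[0], (*view)[1]);
        }
        return 0;
    }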
diff --git a/src/matvec_check.h b/src/matvec_check.h
index 623f561..657c586 100644
--- a/src/matvec_check.h
+++ b/src/matvec_check.h
@@ -4,38 +4,16 @@
class matvec_check {
public:
- matvec_check(SEXP, int, int);
+ matvec_check(const int, const int, SEXP, const bool, const char*, const double=0);
~matvec_check();
void advance();
- const double* const access() const;
- const bool is_row_repeated() const;
- const bool is_col_repeated() const;
+ const double* const* access() const;
private:
- int nrow, ncol;
- bool repeat_row, repeat_col;
-
const double* mycheck;
- int index, libdex, maxdex;
double* temp;
- const double* toget;
-};
-
-class count_holder {
-public:
- count_holder(SEXP);
- void fill_and_next(double*);
- int get_ntags() const;
- int get_nlibs() const;
-
- bool is_data_integer() const;
- const int* get_raw_int() const;
- const double* get_raw_double() const;
-private:
- const int* yiptr;
- const double* ydptr;
- int num_tags, num_libs;
- bool is_integer;
- int libdex;
+ bool isvec, istran;
+ const int nl, nt;
+ int tagdex, libdex;
};
#endif
diff --git a/src/nbdev.cpp b/src/nbdev.cpp
index e5fb367..b3f7837 100644
--- a/src/nbdev.cpp
+++ b/src/nbdev.cpp
@@ -8,7 +8,7 @@
* very big or very small.
*/
-const double one_tenthousandth=std::pow(10, -4.0);
+const double one_million=std::pow(10, 6.0), one_tenthousandth=std::pow(10, -4.0);
const double mildly_low_value=std::pow(10, -8.0);
double compute_unit_nb_deviance (double y, double mu, const double& phi) {
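
The unit deviance this file computes is, in the standard NB parameterisation (a sketch; the guard thresholds are the constants defined above):

    d(y, \mu; \phi) = 2 \left[ y \log\frac{y}{\mu}
        - \Bigl( y + \frac{1}{\phi} \Bigr) \log\frac{y + 1/\phi}{\mu + 1/\phi} \right]

which is replaced by the Poisson unit deviance 2[ y log(y/mu) - (y - mu) ] when mu*phi is very small, and by the gamma unit deviance 2[ (y - mu)/mu - log(y/mu) ] when mu*phi is very large, matching the protection described in the comment above.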
diff --git a/src/utils.h b/src/utils.h
index a483a80..69aaa5c 100644
--- a/src/utils.h
+++ b/src/utils.h
@@ -10,7 +10,6 @@
#include <deque>
#include <stdexcept>
#include <sstream>
-#include <algorithm>
#include "R.h"
#include "Rinternals.h"
@@ -19,51 +18,28 @@
#include "R_ext/BLAS.h"
#include "R_ext/Lapack.h"
+const double low_value=std::pow(10.0, -10.0), log_low_value=std::log(low_value);
+
/* Defining all R-accessible functions. */
extern "C" {
-SEXP R_compute_nbdev(SEXP, SEXP, SEXP, SEXP, SEXP);
+SEXP R_compute_nbdev(SEXP, SEXP, SEXP);
-SEXP R_compute_apl (SEXP, SEXP, SEXP, SEXP, SEXP, SEXP);
+SEXP R_cr_adjust (SEXP, SEXP, SEXP);
SEXP R_exact_test_by_deviance(SEXP, SEXP, SEXP, SEXP, SEXP);
-SEXP R_levenberg (SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP);
-
-SEXP R_get_levenberg_start (SEXP, SEXP, SEXP, SEXP, SEXP, SEXP);
+SEXP R_levenberg (SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP);
SEXP R_loess_by_col(SEXP, SEXP, SEXP, SEXP);
SEXP R_maximize_interpolant(SEXP, SEXP);
-SEXP R_one_group (SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP);
-
-SEXP R_get_one_way_fitted (SEXP, SEXP, SEXP);
+SEXP R_one_group (SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP);
SEXP R_simple_good_turing (SEXP, SEXP, SEXP);
-SEXP R_add_prior_count (SEXP, SEXP, SEXP);
-
-SEXP R_calculate_cpm_log (SEXP, SEXP, SEXP);
-
-SEXP R_calculate_cpm_raw (SEXP, SEXP);
-
-SEXP R_ave_log_cpm(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP);
-
-SEXP R_check_counts (SEXP);
-
-SEXP R_check_finite (SEXP, SEXP);
-
-SEXP R_check_positive (SEXP, SEXP);
-
-SEXP R_check_nonnegative (SEXP, SEXP);
-
-SEXP R_check_zero_fitted (SEXP, SEXP, SEXP);
-
-SEXP R_check_poisson_bound (SEXP, SEXP, SEXP);
-
-SEXP R_add_repeat_matrices(SEXP, SEXP, SEXP, SEXP);
void processHairpinReads(int *, int *, char**, char**, int*,
char**, char**, int*, int*, int*, int*, int*, int*,
@@ -72,10 +48,4 @@ void processHairpinReads(int *, int *, char**, char**, int*,
}
-/* Other utility functions and values */
-
-const double low_value=std::pow(10.0, -10.0), log_low_value=std::log(low_value);
-
-const double LNtwo=std::log(2), one_million=1000000, LNmillion=std::log(one_million);
-
#endif
diff --git a/tests/edgeR-Tests.Rout.save b/tests/edgeR-Tests.Rout.save
index b9c36ec..9c951fb 100644
--- a/tests/edgeR-Tests.Rout.save
+++ b/tests/edgeR-Tests.Rout.save
@@ -1,540 +1,539 @@
-
-R version 3.3.3 (2017-03-06) -- "Another Canoe"
-Copyright (C) 2017 The R Foundation for Statistical Computing
-Platform: x86_64-w64-mingw32/x64 (64-bit)
-
-R is free software and comes with ABSOLUTELY NO WARRANTY.
-You are welcome to redistribute it under certain conditions.
-Type 'license()' or 'licence()' for distribution details.
-
- Natural language support but running in an English locale
-
-R is a collaborative project with many contributors.
-Type 'contributors()' for more information and
-'citation()' on how to cite R or R packages in publications.
-
-Type 'demo()' for some demos, 'help()' for on-line help, or
-'help.start()' for an HTML browser interface to help.
-Type 'q()' to quit R.
-
-> library(edgeR)
-Loading required package: limma
->
-> set.seed(0); u <- runif(100)
->
-> # generate raw counts from NB, create list object
-> y <- matrix(rnbinom(80,size=5,mu=10),nrow=20)
-> y <- rbind(0,c(0,0,2,2),y)
-> rownames(y) <- paste("Tag",1:nrow(y),sep=".")
-> d <- DGEList(counts=y,group=rep(1:2,each=2),lib.size=1001:1004)
->
-> # estimate common dispersion and find differences in expression
-> d <- estimateCommonDisp(d)
-> d$common.dispersion
-[1] 0.210292
-> de <- exactTest(d)
-> summary(de$table)
- logFC logCPM PValue
- Min. :-1.7266 Min. :10.96 Min. :0.01976
- 1st Qu.:-0.4855 1st Qu.:13.21 1st Qu.:0.33120
- Median : 0.2253 Median :13.37 Median :0.56514
- Mean : 0.1877 Mean :13.26 Mean :0.54504
- 3rd Qu.: 0.5258 3rd Qu.:13.70 3rd Qu.:0.81052
- Max. : 4.0861 Max. :14.31 Max. :1.00000
-> topTags(de)
-Comparison of groups: 2-1
- logFC logCPM PValue FDR
-Tag.17 2.0450964 13.73726 0.01975954 0.4347099
-Tag.21 -1.7265870 13.38327 0.06131012 0.6744114
-Tag.6 -1.6329986 12.81479 0.12446044 0.8982100
-Tag.2 4.0861092 11.54121 0.16331090 0.8982100
-Tag.16 0.9324996 13.57074 0.29050785 0.9655885
-Tag.20 0.8543138 13.76364 0.31736609 0.9655885
-Tag.12 0.7081170 14.31389 0.37271028 0.9655885
-Tag.19 -0.7976602 13.31405 0.40166354 0.9655885
-Tag.3 -0.7300410 13.54155 0.42139935 0.9655885
-Tag.8 -0.7917906 12.86353 0.47117217 0.9655885
->
-> d2 <- estimateTagwiseDisp(d,trend="none",prior.df=20)
-> summary(d2$tagwise.dispersion)
- Min. 1st Qu. Median Mean 3rd Qu. Max.
- 0.1757 0.1896 0.1989 0.2063 0.2185 0.2677
-> de <- exactTest(d2,dispersion="common")
-> topTags(de)
-Comparison of groups: 2-1
- logFC logCPM PValue FDR
-Tag.17 2.0450964 13.73726 0.01975954 0.4347099
-Tag.21 -1.7265870 13.38327 0.06131012 0.6744114
-Tag.6 -1.6329986 12.81479 0.12446044 0.8982100
-Tag.2 4.0861092 11.54121 0.16331090 0.8982100
-Tag.16 0.9324996 13.57074 0.29050785 0.9655885
-Tag.20 0.8543138 13.76364 0.31736609 0.9655885
-Tag.12 0.7081170 14.31389 0.37271028 0.9655885
-Tag.19 -0.7976602 13.31405 0.40166354 0.9655885
-Tag.3 -0.7300410 13.54155 0.42139935 0.9655885
-Tag.8 -0.7917906 12.86353 0.47117217 0.9655885
->
-> de <- exactTest(d2)
-> topTags(de)
-Comparison of groups: 2-1
- logFC logCPM PValue FDR
-Tag.17 2.0450987 13.73726 0.01327001 0.2919403
-Tag.21 -1.7265897 13.38327 0.05683886 0.6252275
-Tag.6 -1.6329910 12.81479 0.11460208 0.8404152
-Tag.2 4.0861092 11.54121 0.16126207 0.8869414
-Tag.16 0.9324975 13.57074 0.28103256 0.9669238
-Tag.20 0.8543178 13.76364 0.30234789 0.9669238
-Tag.12 0.7081149 14.31389 0.37917895 0.9669238
-Tag.19 -0.7976633 13.31405 0.40762735 0.9669238
-Tag.3 -0.7300478 13.54155 0.40856822 0.9669238
-Tag.8 -0.7918243 12.86353 0.49005179 0.9669238
->
-> d2 <- estimateTagwiseDisp(d,trend="movingave",span=0.4,prior.df=20)
-> summary(d2$tagwise.dispersion)
- Min. 1st Qu. Median Mean 3rd Qu. Max.
- 0.1005 0.1629 0.2064 0.2077 0.2585 0.3164
-> de <- exactTest(d2)
-> topTags(de)
-Comparison of groups: 2-1
- logFC logCPM PValue FDR
-Tag.17 2.0450951 13.73726 0.02427872 0.5341319
-Tag.21 -1.7265927 13.38327 0.05234833 0.5758316
-Tag.6 -1.6330014 12.81479 0.12846308 0.8954397
-Tag.2 4.0861092 11.54121 0.16280722 0.8954397
-Tag.16 0.9324887 13.57074 0.24308201 0.9711975
-Tag.20 0.8543044 13.76364 0.35534649 0.9711975
-Tag.19 -0.7976535 13.31405 0.38873717 0.9711975
-Tag.3 -0.7300525 13.54155 0.40001438 0.9711975
-Tag.12 0.7080985 14.31389 0.43530227 0.9711975
-Tag.8 -0.7918376 12.86353 0.49782701 0.9711975
->
-> summary(exactTest(d2,rejection="smallp")$table$PValue)
- Min. 1st Qu. Median Mean 3rd Qu. Max.
-0.02428 0.36370 0.55660 0.54320 0.78890 1.00000
-> summary(exactTest(d2,rejection="deviance")$table$PValue)
- Min. 1st Qu. Median Mean 3rd Qu. Max.
-0.02428 0.36370 0.55660 0.54320 0.78890 1.00000
->
-> d2 <- estimateTagwiseDisp(d,trend="loess",span=0.8,prior.df=20)
-> summary(d2$tagwise.dispersion)
- Min. 1st Qu. Median Mean 3rd Qu. Max.
- 0.1165 0.1449 0.1832 0.1848 0.2116 0.2825
-> de <- exactTest(d2)
-> topTags(de)
-Comparison of groups: 2-1
- logFC logCPM PValue FDR
-Tag.17 2.0450979 13.73726 0.01546795 0.3402949
-Tag.21 -1.7266049 13.38327 0.03545446 0.3899990
-Tag.6 -1.6329841 12.81479 0.10632987 0.7797524
-Tag.2 4.0861092 11.54121 0.16057893 0.8831841
-Tag.16 0.9324935 13.57074 0.26348818 0.9658389
-Tag.20 0.8543140 13.76364 0.31674090 0.9658389
-Tag.19 -0.7976354 13.31405 0.35564858 0.9658389
-Tag.3 -0.7300593 13.54155 0.38833737 0.9658389
-Tag.12 0.7081041 14.31389 0.41513004 0.9658389
-Tag.8 -0.7918152 12.86353 0.48483449 0.9658389
->
-> d2 <- estimateTagwiseDisp(d,trend="tricube",span=0.8,prior.df=20)
-> summary(d2$tagwise.dispersion)
- Min. 1st Qu. Median Mean 3rd Qu. Max.
- 0.1165 0.1449 0.1832 0.1848 0.2116 0.2825
-> de <- exactTest(d2)
-> topTags(de)
-Comparison of groups: 2-1
- logFC logCPM PValue FDR
-Tag.17 2.0450979 13.73726 0.01546795 0.3402949
-Tag.21 -1.7266049 13.38327 0.03545446 0.3899990
-Tag.6 -1.6329841 12.81479 0.10632987 0.7797524
-Tag.2 4.0861092 11.54121 0.16057893 0.8831841
-Tag.16 0.9324935 13.57074 0.26348818 0.9658389
-Tag.20 0.8543140 13.76364 0.31674090 0.9658389
-Tag.19 -0.7976354 13.31405 0.35564858 0.9658389
-Tag.3 -0.7300593 13.54155 0.38833737 0.9658389
-Tag.12 0.7081041 14.31389 0.41513004 0.9658389
-Tag.8 -0.7918152 12.86353 0.48483449 0.9658389
->
-> # mglmOneWay
-> design <- model.matrix(~group,data=d$samples)
-> mglmOneWay(d[1:10,],design,dispersion=0.2)
-$coefficients
- (Intercept) group2
- [1,] -1.000000e+08 0.000000e+00
- [2,] -1.000000e+08 1.000000e+08
- [3,] 2.525729e+00 -5.108256e-01
- [4,] 2.525729e+00 1.484200e-01
- [5,] 2.140066e+00 -1.941560e-01
- [6,] 2.079442e+00 -1.163151e+00
- [7,] 2.014903e+00 2.363888e-01
- [8,] 1.945910e+00 -5.596158e-01
- [9,] 1.504077e+00 2.006707e-01
-[10,] 2.302585e+00 2.623643e-01
-
-$fitted.values
- [,1] [,2] [,3] [,4]
- [1,] 0.0 0.0 0.0 0.0
- [2,] 0.0 0.0 2.0 2.0
- [3,] 12.5 12.5 7.5 7.5
- [4,] 12.5 12.5 14.5 14.5
- [5,] 8.5 8.5 7.0 7.0
- [6,] 8.0 8.0 2.5 2.5
- [7,] 7.5 7.5 9.5 9.5
- [8,] 7.0 7.0 4.0 4.0
- [9,] 4.5 4.5 5.5 5.5
-[10,] 10.0 10.0 13.0 13.0
-
-> mglmOneWay(d[1:10,],design,dispersion=0)
-$coefficients
- (Intercept) group2
- [1,] -1.000000e+08 0.000000e+00
- [2,] -1.000000e+08 1.000000e+08
- [3,] 2.525729e+00 -5.108256e-01
- [4,] 2.525729e+00 1.484200e-01
- [5,] 2.140066e+00 -1.941560e-01
- [6,] 2.079442e+00 -1.163151e+00
- [7,] 2.014903e+00 2.363888e-01
- [8,] 1.945910e+00 -5.596158e-01
- [9,] 1.504077e+00 2.006707e-01
-[10,] 2.302585e+00 2.623643e-01
-
-$fitted.values
- [,1] [,2] [,3] [,4]
- [1,] 0.0 0.0 0.0 0.0
- [2,] 0.0 0.0 2.0 2.0
- [3,] 12.5 12.5 7.5 7.5
- [4,] 12.5 12.5 14.5 14.5
- [5,] 8.5 8.5 7.0 7.0
- [6,] 8.0 8.0 2.5 2.5
- [7,] 7.5 7.5 9.5 9.5
- [8,] 7.0 7.0 4.0 4.0
- [9,] 4.5 4.5 5.5 5.5
-[10,] 10.0 10.0 13.0 13.0
-
->
-> fit <- glmFit(d,design,dispersion=d$common.dispersion,prior.count=0.5/4)
-> lrt <- glmLRT(fit,coef=2)
-> topTags(lrt)
-Coefficient: group2
- logFC logCPM LR PValue FDR
-Tag.17 2.0450964 13.73726 6.0485417 0.01391779 0.3058698
-Tag.2 4.0861092 11.54121 4.8400340 0.02780635 0.3058698
-Tag.21 -1.7265870 13.38327 4.0777825 0.04345065 0.3186381
-Tag.6 -1.6329986 12.81479 3.0078205 0.08286364 0.4557500
-Tag.16 0.9324996 13.57074 1.3477682 0.24566867 0.8276702
-Tag.20 0.8543138 13.76364 1.1890032 0.27553071 0.8276702
-Tag.19 -0.7976602 13.31405 0.9279151 0.33540526 0.8276702
-Tag.12 0.7081170 14.31389 0.9095513 0.34023349 0.8276702
-Tag.3 -0.7300410 13.54155 0.8300307 0.36226364 0.8276702
-Tag.8 -0.7917906 12.86353 0.7830377 0.37621371 0.8276702
->
-> fit <- glmFit(d,design,dispersion=d$common.dispersion,prior.count=0.5)
-> summary(fit$coef)
- (Intercept) group2
- Min. :-7.604 Min. :-1.13681
- 1st Qu.:-4.895 1st Qu.:-0.32341
- Median :-4.713 Median : 0.15083
- Mean :-4.940 Mean : 0.07817
- 3rd Qu.:-4.524 3rd Qu.: 0.35163
- Max. :-4.107 Max. : 1.60864
->
-> fit <- glmFit(d,design,prior.count=0.5/4)
-> lrt <- glmLRT(fit,coef=2)
-> topTags(lrt)
-Coefficient: group2
- logFC logCPM LR PValue FDR
-Tag.17 2.0450964 13.73726 6.0485417 0.01391779 0.3058698
-Tag.2 4.0861092 11.54121 4.8400340 0.02780635 0.3058698
-Tag.21 -1.7265870 13.38327 4.0777825 0.04345065 0.3186381
-Tag.6 -1.6329986 12.81479 3.0078205 0.08286364 0.4557500
-Tag.16 0.9324996 13.57074 1.3477682 0.24566867 0.8276702
-Tag.20 0.8543138 13.76364 1.1890032 0.27553071 0.8276702
-Tag.19 -0.7976602 13.31405 0.9279151 0.33540526 0.8276702
-Tag.12 0.7081170 14.31389 0.9095513 0.34023349 0.8276702
-Tag.3 -0.7300410 13.54155 0.8300307 0.36226364 0.8276702
-Tag.8 -0.7917906 12.86353 0.7830377 0.37621371 0.8276702
->
-> dglm <- estimateGLMCommonDisp(d,design)
-> dglm$common.dispersion
-[1] 0.2033282
-> dglm <- estimateGLMTagwiseDisp(dglm,design,prior.df=20)
-> summary(dglm$tagwise.dispersion)
- Min. 1st Qu. Median Mean 3rd Qu. Max.
- 0.1756 0.1879 0.1998 0.2031 0.2135 0.2578
-> fit <- glmFit(dglm,design,prior.count=0.5/4)
-> lrt <- glmLRT(fit,coef=2)
-> topTags(lrt)
-Coefficient: group2
- logFC logCPM LR PValue FDR
-Tag.17 2.0450988 13.73727 6.8001118 0.009115216 0.2005348
-Tag.2 4.0861092 11.54122 4.8594088 0.027495756 0.2872068
-Tag.21 -1.7265904 13.38327 4.2537154 0.039164558 0.2872068
-Tag.6 -1.6329904 12.81479 3.1763761 0.074710253 0.4109064
-Tag.16 0.9324970 13.57074 1.4126709 0.234613512 0.8499599
-Tag.20 0.8543183 13.76364 1.2721097 0.259371274 0.8499599
-Tag.19 -0.7976614 13.31405 0.9190392 0.337727381 0.8499599
-Tag.12 0.7081163 14.31389 0.9014515 0.342392806 0.8499599
-Tag.3 -0.7300488 13.54155 0.8817937 0.347710872 0.8499599
-Tag.8 -0.7918166 12.86353 0.7356185 0.391068049 0.8603497
-> dglm <- estimateGLMTrendedDisp(dglm,design)
-> summary(dglm$trended.dispersion)
- Min. 1st Qu. Median Mean 3rd Qu. Max.
- 0.1522 0.1676 0.1740 0.1887 0.2000 0.3469
-> dglm <- estimateGLMTrendedDisp(dglm,design,method="power")
-> summary(dglm$trended.dispersion)
- Min. 1st Qu. Median Mean 3rd Qu. Max.
- 0.1522 0.1676 0.1740 0.1887 0.2000 0.3469
-> dglm <- estimateGLMTrendedDisp(dglm,design,method="spline")
-> summary(dglm$trended.dispersion)
- Min. 1st Qu. Median Mean 3rd Qu. Max.
-0.09353 0.11080 0.15460 0.19010 0.23050 0.52010
-> dglm <- estimateGLMTrendedDisp(dglm,design,method="bin.spline")
-> summary(dglm$trended.dispersion)
- Min. 1st Qu. Median Mean 3rd Qu. Max.
- 0.1997 0.1997 0.1997 0.1997 0.1997 0.1997
-> dglm <- estimateGLMTagwiseDisp(dglm,design,prior.df=20)
-> summary(dglm$tagwise.dispersion)
- Min. 1st Qu. Median Mean 3rd Qu. Max.
- 0.1385 0.1792 0.1964 0.1935 0.2026 0.2709
->
-> dglm2 <- estimateDisp(dglm, design)
-> summary(dglm2$tagwise.dispersion)
- Min. 1st Qu. Median Mean 3rd Qu. Max.
- 0.1423 0.1618 0.1788 0.1863 0.2015 0.2692
-> dglm2 <- estimateDisp(dglm, design, prior.df=20)
-> summary(dglm2$tagwise.dispersion)
- Min. 1st Qu. Median Mean 3rd Qu. Max.
- 0.1527 0.1669 0.1814 0.1858 0.1951 0.2497
-> dglm2 <- estimateDisp(dglm, design, robust=TRUE)
-> summary(dglm2$tagwise.dispersion)
- Min. 1st Qu. Median Mean 3rd Qu. Max.
- 0.1423 0.1605 0.1783 0.1867 0.2031 0.2740
->
-> # Continuous trend
-> nlibs <- 3
-> ntags <- 1000
-> dispersion.true <- 0.1
-> # Make first transcript respond to covariate x
-> x <- 0:2
-> design <- model.matrix(~x)
-> beta.true <- cbind(Beta1=2,Beta2=c(2,rep(0,ntags-1)))
-> mu.true <- 2^(beta.true %*% t(design))
-> # Generate count data
-> y <- rnbinom(ntags*nlibs,mu=mu.true,size=1/dispersion.true)
-> y <- matrix(y,ntags,nlibs)
-> colnames(y) <- c("x0","x1","x2")
-> rownames(y) <- paste("Gene",1:ntags,sep="")
-> d <- DGEList(y)
-> d <- calcNormFactors(d)
-> fit <- glmFit(d, design, dispersion=dispersion.true, prior.count=0.5/3)
-> results <- glmLRT(fit, coef=2)
-> topTags(results)
-Coefficient: x
- logFC logCPM LR PValue FDR
-Gene1 2.907024 13.56183 38.738512 4.845536e-10 4.845536e-07
-Gene61 2.855317 10.27136 10.738307 1.049403e-03 5.247015e-01
-Gene62 -2.123902 10.53174 8.818703 2.981585e-03 8.334760e-01
-Gene134 -1.949073 10.53355 8.125889 4.363759e-03 8.334760e-01
-Gene740 -1.610046 10.94907 8.013408 4.643227e-03 8.334760e-01
-Gene354 2.022698 10.45066 7.826308 5.149118e-03 8.334760e-01
-Gene5 1.856816 10.45249 7.214238 7.232750e-03 8.334760e-01
-Gene746 -1.798331 10.53094 6.846262 8.882693e-03 8.334760e-01
-Gene110 1.623148 10.68607 6.737984 9.438120e-03 8.334760e-01
-Gene383 1.637140 10.75412 6.687530 9.708965e-03 8.334760e-01
-> d1 <- estimateGLMCommonDisp(d, design, verbose=TRUE)
-Disp = 0.10253 , BCV = 0.3202
-> glmFit(d,design,dispersion=dispersion.true, prior.count=0.5/3)
-An object of class "DGEGLM"
-$coefficients
- (Intercept) x
-Gene1 -7.391745 2.0149958
-Gene2 -7.318483 -0.7611895
-Gene3 -6.831702 -0.1399478
-Gene4 -7.480255 0.5172002
-Gene5 -8.747793 1.2870467
-995 more rows ...
-
-$fitted.values
- x0 x1 x2
-Gene1 2.3570471 18.954454 138.2791328
-Gene2 2.5138172 1.089292 0.4282107
-Gene3 4.1580452 3.750528 3.0690081
-Gene4 2.1012460 3.769592 6.1349937
-Gene5 0.5080377 2.136398 8.1502486
-995 more rows ...
-
-$deviance
-[1] 6.38037545 1.46644913 1.38532340 0.01593969 1.03894513
-995 more elements ...
-
-$iter
-[1] 8 4 4 4 6
-995 more elements ...
-
-$failed
-[1] FALSE FALSE FALSE FALSE FALSE
-995 more elements ...
-
-$method
-[1] "levenberg"
-
-$counts
- x0 x1 x2
-Gene1 0 30 110
-Gene2 2 2 0
-Gene3 3 6 2
-Gene4 2 4 6
-Gene5 1 1 9
-995 more rows ...
-
-$unshrunk.coefficients
- (Intercept) x
-Gene1 -7.437763 2.0412762
-Gene2 -7.373370 -0.8796273
-Gene3 -6.870127 -0.1465014
-Gene4 -7.552642 0.5410832
-Gene5 -8.972372 1.3929679
-995 more rows ...
-
-$df.residual
-[1] 1 1 1 1 1
-995 more elements ...
-
-$design
- (Intercept) x
-1 1 0
-2 1 1
-3 1 2
-attr(,"assign")
-[1] 0 1
-
-$offset
- [,1] [,2] [,3]
-x 8.295172 8.338525 8.284484
-attr(,"class")
-[1] "compressedMatrix"
-attr(,"repeat.row")
-[1] TRUE
-attr(,"repeat.col")
-[1] FALSE
-
-$dispersion
-[1] 0.1
-
-$prior.count
-[1] 0.1666667
-
-$samples
- group lib.size norm.factors
-x0 1 4001 1.0008730
-x1 1 4176 1.0014172
-x2 1 3971 0.9977138
-
-$AveLogCPM
-[1] 13.561832 9.682757 10.447014 10.532113 10.452489
-995 more elements ...
-
->
-> d2 <- estimateDisp(d, design)
-> summary(d2$tagwise.dispersion)
- Min. 1st Qu. Median Mean 3rd Qu. Max.
-0.05545 0.09511 0.11620 0.11010 0.13330 0.16860
-> d2 <- estimateDisp(d, design, prior.df=20)
-> summary(d2$tagwise.dispersion)
- Min. 1st Qu. Median Mean 3rd Qu. Max.
-0.04203 0.08586 0.11280 0.11010 0.12370 0.37410
-> d2 <- estimateDisp(d, design, robust=TRUE)
-> summary(d2$tagwise.dispersion)
- Min. 1st Qu. Median Mean 3rd Qu. Max.
-0.05545 0.09511 0.11620 0.11010 0.13330 0.16860
->
-> # Exact tests
-> y <- matrix(rnbinom(20,mu=10,size=3/2),5,4)
-> group <- factor(c(1,1,2,2))
-> ys <- splitIntoGroupsPseudo(y,group,pair=c(1,2))
-> exactTestDoubleTail(ys$y1,ys$y2,dispersion=2/3)
-[1] 0.1334396 0.6343568 0.7280015 0.7124912 0.3919258
->
-> y <- matrix(rnbinom(5*7,mu=10,size=3/2),5,7)
-> group <- factor(c(1,1,2,2,3,3,3))
-> ys <- splitIntoGroupsPseudo(y,group,pair=c(1,3))
-> exactTestDoubleTail(ys$y1,ys$y2,dispersion=2/3)
-[1] 1.0000000 0.4486382 1.0000000 0.9390317 0.4591241
-> exactTestBetaApprox(ys$y1,ys$y2,dispersion=2/3)
-[1] 1.0000000 0.4492969 1.0000000 0.9421695 0.4589194
->
-> y[1,3:4] <- 0
-> design <- model.matrix(~group)
-> fit <- glmFit(y,design,dispersion=2/3,prior.count=0.5/7)
-> summary(fit$coef)
- (Intercept) group2 group3
- Min. :-1.817 Min. :-5.0171 Min. :-0.64646
- 1st Qu.:-1.812 1st Qu.:-1.1565 1st Qu.:-0.13919
- Median :-1.712 Median : 0.1994 Median :-0.10441
- Mean :-1.625 Mean :-0.9523 Mean :-0.04217
- 3rd Qu.:-1.429 3rd Qu.: 0.3755 3rd Qu.:-0.04305
- Max. :-1.356 Max. : 0.8374 Max. : 0.72227
->
-> lrt <- glmLRT(fit,contrast=cbind(c(0,1,0),c(0,0,1)))
-> topTags(lrt)
-Coefficient: LR test on 2 degrees of freedom
- logFC.1 logFC.2 logCPM LR PValue FDR
-1 -7.2381060 -0.0621100 17.19071 10.7712171 0.004582051 0.02291026
-5 -1.6684268 -0.9326507 17.33529 1.7309951 0.420842115 0.90967967
-2 1.2080938 1.0420198 18.24544 1.0496688 0.591653347 0.90967967
-4 0.5416704 -0.1506381 17.57744 0.3958596 0.820427427 0.90967967
-3 0.2876249 -0.2008143 18.06216 0.1893255 0.909679672 0.90967967
-> design <- model.matrix(~0+group)
-> fit <- glmFit(y,design,dispersion=2/3,prior.count=0.5/7)
-> lrt <- glmLRT(fit,contrast=cbind(c(-1,1,0),c(0,-1,1),c(-1,0,1)))
-> topTags(lrt)
-Coefficient: LR test on 2 degrees of freedom
- logFC.1 logFC.2 logFC.3 logCPM LR PValue FDR
-1 -7.2381060 7.1759960 -0.0621100 17.19071 10.7712171 0.004582051 0.02291026
-5 -1.6684268 0.7357761 -0.9326507 17.33529 1.7309951 0.420842115 0.90967967
-2 1.2080938 -0.1660740 1.0420198 18.24544 1.0496688 0.591653347 0.90967967
-4 0.5416704 -0.6923084 -0.1506381 17.57744 0.3958596 0.820427427 0.90967967
-3 0.2876249 -0.4884392 -0.2008143 18.06216 0.1893255 0.909679672 0.90967967
->
-> # simple Good-Turing algorithm runs.
-> test1<-1:9
-> freq1<-c(2018046, 449721, 188933, 105668, 68379, 48190, 35709, 37710, 22280)
-> goodTuring(rep(test1, freq1))
-$P0
-[1] 0.3814719
-
-$proportion
-[1] 8.035111e-08 2.272143e-07 4.060582e-07 5.773690e-07 7.516705e-07
-[6] 9.276808e-07 1.104759e-06 1.282549e-06 1.460837e-06
-
-$count
-[1] 1 2 3 4 5 6 7 8 9
-
-$n
-[1] 2018046 449721 188933 105668 68379 48190 35709 37710 22280
-
-$n0
-[1] 0
-
-> test2<-c(312, 14491, 16401, 65124, 129797, 323321, 366051, 368599, 405261, 604962)
-> goodTuring(test2)
-$P0
-[1] 0
-
-$proportion
- [1] 0.0001362656 0.0063162959 0.0071487846 0.0283850925 0.0565733349
- [6] 0.1409223124 0.1595465235 0.1606570896 0.1766365144 0.2636777866
-
-$count
- [1] 312 14491 16401 65124 129797 323321 366051 368599 405261 604962
-
-$n
- [1] 1 1 1 1 1 1 1 1 1 1
-
-$n0
-[1] 0
-
->
->
->
-> proc.time()
- user system elapsed
- 3.29 0.14 4.24
+
+R version 3.2.4 Revised (2016-03-16 r70336) -- "Very Secure Dishes"
+Copyright (C) 2016 The R Foundation for Statistical Computing
+Platform: x86_64-w64-mingw32/x64 (64-bit)
+
+R is free software and comes with ABSOLUTELY NO WARRANTY.
+You are welcome to redistribute it under certain conditions.
+Type 'license()' or 'licence()' for distribution details.
+
+ Natural language support but running in an English locale
+
+R is a collaborative project with many contributors.
+Type 'contributors()' for more information and
+'citation()' on how to cite R or R packages in publications.
+
+Type 'demo()' for some demos, 'help()' for on-line help, or
+'help.start()' for an HTML browser interface to help.
+Type 'q()' to quit R.
+
+> library(edgeR)
+Loading required package: limma
+>
+> set.seed(0); u <- runif(100)
+>
+> # generate raw counts from NB, create list object
+> y <- matrix(rnbinom(80,size=5,mu=10),nrow=20)
+> y <- rbind(0,c(0,0,2,2),y)
+> rownames(y) <- paste("Tag",1:nrow(y),sep=".")
+> d <- DGEList(counts=y,group=rep(1:2,each=2),lib.size=1001:1004)
+>
+> # estimate common dispersion and find differences in expression
+> d <- estimateCommonDisp(d)
+> d$common.dispersion
+[1] 0.210292
+> de <- exactTest(d)
+> summary(de$table)
+ logFC logCPM PValue
+ Min. :-1.7266 Min. :10.96 Min. :0.01976
+ 1st Qu.:-0.4855 1st Qu.:13.21 1st Qu.:0.33120
+ Median : 0.2253 Median :13.37 Median :0.56514
+ Mean : 0.1877 Mean :13.26 Mean :0.54504
+ 3rd Qu.: 0.5258 3rd Qu.:13.70 3rd Qu.:0.81052
+ Max. : 4.0861 Max. :14.31 Max. :1.00000
+> topTags(de)
+Comparison of groups: 2-1
+ logFC logCPM PValue FDR
+Tag.17 2.0450964 13.73726 0.01975954 0.4347099
+Tag.21 -1.7265870 13.38327 0.06131012 0.6744114
+Tag.6 -1.6329986 12.81479 0.12446044 0.8982100
+Tag.2 4.0861092 11.54121 0.16331090 0.8982100
+Tag.16 0.9324996 13.57074 0.29050785 0.9655885
+Tag.20 0.8543138 13.76364 0.31736609 0.9655885
+Tag.12 0.7081170 14.31389 0.37271028 0.9655885
+Tag.19 -0.7976602 13.31405 0.40166354 0.9655885
+Tag.3 -0.7300410 13.54155 0.42139935 0.9655885
+Tag.8 -0.7917906 12.86353 0.47117217 0.9655885
+>
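
For reference, edgeR's biological coefficient of variation (BCV) is the
square root of the NB dispersion, so the common dispersion printed above
implies a BCV of about 0.46. A quick check using the value from the
transcript:

    sqrt(0.210292)  # ~0.4586, the BCV implied by the common dispersion
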
+> d2 <- estimateTagwiseDisp(d,trend="none",prior.df=20)
+> summary(d2$tagwise.dispersion)
+ Min. 1st Qu. Median Mean 3rd Qu. Max.
+ 0.1757 0.1896 0.1989 0.2063 0.2185 0.2677
+> de <- exactTest(d2,dispersion="common")
+> topTags(de)
+Comparison of groups: 2-1
+ logFC logCPM PValue FDR
+Tag.17 2.0450964 13.73726 0.01975954 0.4347099
+Tag.21 -1.7265870 13.38327 0.06131012 0.6744114
+Tag.6 -1.6329986 12.81479 0.12446044 0.8982100
+Tag.2 4.0861092 11.54121 0.16331090 0.8982100
+Tag.16 0.9324996 13.57074 0.29050785 0.9655885
+Tag.20 0.8543138 13.76364 0.31736609 0.9655885
+Tag.12 0.7081170 14.31389 0.37271028 0.9655885
+Tag.19 -0.7976602 13.31405 0.40166354 0.9655885
+Tag.3 -0.7300410 13.54155 0.42139935 0.9655885
+Tag.8 -0.7917906 12.86353 0.47117217 0.9655885
+>
+> de <- exactTest(d2)
+> topTags(de)
+Comparison of groups: 2-1
+ logFC logCPM PValue FDR
+Tag.17 2.0450987 13.73726 0.01327001 0.2919403
+Tag.21 -1.7265897 13.38327 0.05683886 0.6252275
+Tag.6 -1.6329910 12.81479 0.11460208 0.8404152
+Tag.2 4.0861092 11.54121 0.16126207 0.8869414
+Tag.16 0.9324975 13.57074 0.28103256 0.9669238
+Tag.20 0.8543178 13.76364 0.30234789 0.9669238
+Tag.12 0.7081149 14.31389 0.37917895 0.9669238
+Tag.19 -0.7976633 13.31405 0.40762735 0.9669238
+Tag.3 -0.7300478 13.54155 0.40856822 0.9669238
+Tag.8 -0.7918243 12.86353 0.49005179 0.9669238
+>
+> d2 <- estimateTagwiseDisp(d,trend="movingave",span=0.4,prior.df=20)
+> summary(d2$tagwise.dispersion)
+ Min. 1st Qu. Median Mean 3rd Qu. Max.
+ 0.1005 0.1629 0.2064 0.2077 0.2585 0.3164
+> de <- exactTest(d2)
+> topTags(de)
+Comparison of groups: 2-1
+ logFC logCPM PValue FDR
+Tag.17 2.0450951 13.73726 0.02427872 0.5341319
+Tag.21 -1.7265927 13.38327 0.05234833 0.5758316
+Tag.6 -1.6330014 12.81479 0.12846308 0.8954397
+Tag.2 4.0861092 11.54121 0.16280722 0.8954397
+Tag.16 0.9324887 13.57074 0.24308201 0.9711975
+Tag.20 0.8543044 13.76364 0.35534649 0.9711975
+Tag.19 -0.7976535 13.31405 0.38873717 0.9711975
+Tag.3 -0.7300525 13.54155 0.40001438 0.9711975
+Tag.12 0.7080985 14.31389 0.43530227 0.9711975
+Tag.8 -0.7918376 12.86353 0.49782701 0.9711975
+>
+> summary(exactTest(d2,rejection="smallp")$table$PValue)
+ Min. 1st Qu. Median Mean 3rd Qu. Max.
+0.02428 0.36370 0.55660 0.54320 0.78890 1.00000
+> summary(exactTest(d2,rejection="deviance")$table$PValue)
+ Min. 1st Qu. Median Mean 3rd Qu. Max.
+0.02428 0.36370 0.55660 0.54320 0.78890 1.00000
+>
+> d2 <- estimateTagwiseDisp(d,trend="loess",span=0.8,prior.df=20)
+> summary(d2$tagwise.dispersion)
+ Min. 1st Qu. Median Mean 3rd Qu. Max.
+ 0.1165 0.1449 0.1832 0.1848 0.2116 0.2825
+> de <- exactTest(d2)
+> topTags(de)
+Comparison of groups: 2-1
+ logFC logCPM PValue FDR
+Tag.17 2.0450979 13.73726 0.01546795 0.3402949
+Tag.21 -1.7266049 13.38327 0.03545446 0.3899990
+Tag.6 -1.6329841 12.81479 0.10632987 0.7797524
+Tag.2 4.0861092 11.54121 0.16057893 0.8831841
+Tag.16 0.9324935 13.57074 0.26348818 0.9658389
+Tag.20 0.8543140 13.76364 0.31674090 0.9658389
+Tag.19 -0.7976354 13.31405 0.35564858 0.9658389
+Tag.3 -0.7300593 13.54155 0.38833737 0.9658389
+Tag.12 0.7081041 14.31389 0.41513004 0.9658389
+Tag.8 -0.7918152 12.86353 0.48483449 0.9658389
+>
+> d2 <- estimateTagwiseDisp(d,trend="tricube",span=0.8,prior.df=20)
+> summary(d2$tagwise.dispersion)
+ Min. 1st Qu. Median Mean 3rd Qu. Max.
+ 0.1165 0.1449 0.1832 0.1848 0.2116 0.2825
+> de <- exactTest(d2)
+> topTags(de)
+Comparison of groups: 2-1
+ logFC logCPM PValue FDR
+Tag.17 2.0450979 13.73726 0.01546795 0.3402949
+Tag.21 -1.7266049 13.38327 0.03545446 0.3899990
+Tag.6 -1.6329841 12.81479 0.10632987 0.7797524
+Tag.2 4.0861092 11.54121 0.16057893 0.8831841
+Tag.16 0.9324935 13.57074 0.26348818 0.9658389
+Tag.20 0.8543140 13.76364 0.31674090 0.9658389
+Tag.19 -0.7976354 13.31405 0.35564858 0.9658389
+Tag.3 -0.7300593 13.54155 0.38833737 0.9658389
+Tag.12 0.7081041 14.31389 0.41513004 0.9658389
+Tag.8 -0.7918152 12.86353 0.48483449 0.9658389
+>
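
The prior.df argument used in these calls controls how strongly the
tagwise estimates are squeezed towards the shared value: more prior
degrees of freedom mean heavier shrinkage and a narrower spread. A
minimal sketch, reusing the d object from the transcript above (the
exact values will vary):

    # Hypothetical comparison: heavier shrinkage should narrow the spread
    d.strong <- estimateTagwiseDisp(d, trend="none", prior.df=100)
    d.weak <- estimateTagwiseDisp(d, trend="none", prior.df=5)
    sd(d.strong$tagwise.dispersion) < sd(d.weak$tagwise.dispersion)  # expected TRUE
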
+> # mglmOneWay
+> design <- model.matrix(~group,data=d$samples)
+> mglmOneWay(d[1:10,],design,dispersion=0.2)
+$coefficients
+ (Intercept) group2
+ [1,] -1.000000e+08 0.000000e+00
+ [2,] -1.000000e+08 1.000000e+08
+ [3,] 2.525729e+00 -5.108256e-01
+ [4,] 2.525729e+00 1.484200e-01
+ [5,] 2.140066e+00 -1.941560e-01
+ [6,] 2.079442e+00 -1.163151e+00
+ [7,] 2.014903e+00 2.363888e-01
+ [8,] 1.945910e+00 -5.596158e-01
+ [9,] 1.504077e+00 2.006707e-01
+[10,] 2.302585e+00 2.623643e-01
+
+$fitted.values
+ [,1] [,2] [,3] [,4]
+ [1,] 0.0 0.0 0.0 0.0
+ [2,] 0.0 0.0 2.0 2.0
+ [3,] 12.5 12.5 7.5 7.5
+ [4,] 12.5 12.5 14.5 14.5
+ [5,] 8.5 8.5 7.0 7.0
+ [6,] 8.0 8.0 2.5 2.5
+ [7,] 7.5 7.5 9.5 9.5
+ [8,] 7.0 7.0 4.0 4.0
+ [9,] 4.5 4.5 5.5 5.5
+[10,] 10.0 10.0 13.0 13.0
+
+> mglmOneWay(d[1:10,],design,dispersion=0)
+$coefficients
+ (Intercept) group2
+ [1,] -1.000000e+08 0.000000e+00
+ [2,] -1.000000e+08 1.000000e+08
+ [3,] 2.525729e+00 -5.108256e-01
+ [4,] 2.525729e+00 1.484200e-01
+ [5,] 2.140066e+00 -1.941560e-01
+ [6,] 2.079442e+00 -1.163151e+00
+ [7,] 2.014903e+00 2.363888e-01
+ [8,] 1.945910e+00 -5.596158e-01
+ [9,] 1.504077e+00 2.006707e-01
+[10,] 2.302585e+00 2.623643e-01
+
+$fitted.values
+ [,1] [,2] [,3] [,4]
+ [1,] 0.0 0.0 0.0 0.0
+ [2,] 0.0 0.0 2.0 2.0
+ [3,] 12.5 12.5 7.5 7.5
+ [4,] 12.5 12.5 14.5 14.5
+ [5,] 8.5 8.5 7.0 7.0
+ [6,] 8.0 8.0 2.5 2.5
+ [7,] 7.5 7.5 9.5 9.5
+ [8,] 7.0 7.0 4.0 4.0
+ [9,] 4.5 4.5 5.5 5.5
+[10,] 10.0 10.0 13.0 13.0
+
+>
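
The two mglmOneWay calls agree because, for a one-way layout, the NB
maximum likelihood fitted values are simply the group means, whatever
the dispersion. The coefficients are on the natural log scale (the
-1.000000e+08 entries are a finite stand-in for log(0) = -Inf where a
group's counts are all zero), so the group means can be recovered with
exp(). Checking row 3 of the output above:

    exp(2.525729)              # 12.5, the group-1 mean
    exp(2.525729 - 0.5108256)  # 7.5, the group-2 mean
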
+> fit <- glmFit(d,design,dispersion=d$common.dispersion,prior.count=0.5/4)
+> lrt <- glmLRT(fit,coef=2)
+> topTags(lrt)
+Coefficient: group2
+ logFC logCPM LR PValue FDR
+Tag.17 2.0450964 13.73726 6.0485417 0.01391779 0.3058698
+Tag.2 4.0861092 11.54121 4.8400340 0.02780635 0.3058698
+Tag.21 -1.7265870 13.38327 4.0777825 0.04345065 0.3186381
+Tag.6 -1.6329986 12.81479 3.0078205 0.08286364 0.4557500
+Tag.16 0.9324996 13.57074 1.3477682 0.24566867 0.8276702
+Tag.20 0.8543138 13.76364 1.1890032 0.27553071 0.8276702
+Tag.19 -0.7976602 13.31405 0.9279151 0.33540526 0.8276702
+Tag.12 0.7081170 14.31389 0.9095513 0.34023349 0.8276702
+Tag.3 -0.7300410 13.54155 0.8300307 0.36226364 0.8276702
+Tag.8 -0.7917906 12.86353 0.7830377 0.37621371 0.8276702
+>
+> fit <- glmFit(d,design,dispersion=d$common.dispersion,prior.count=0.5)
+> summary(fit$coef)
+ (Intercept) group2
+ Min. :-7.604 Min. :-1.13681
+ 1st Qu.:-4.895 1st Qu.:-0.32341
+ Median :-4.713 Median : 0.15083
+ Mean :-4.940 Mean : 0.07817
+ 3rd Qu.:-4.524 3rd Qu.: 0.35163
+ Max. :-4.107 Max. : 1.60864
+>
+> fit <- glmFit(d,design,prior.count=0.5/4)
+> lrt <- glmLRT(fit,coef=2)
+> topTags(lrt)
+Coefficient: group2
+ logFC logCPM LR PValue FDR
+Tag.17 2.0450964 13.73726 6.0485417 0.01391779 0.3058698
+Tag.2 4.0861092 11.54121 4.8400340 0.02780635 0.3058698
+Tag.21 -1.7265870 13.38327 4.0777825 0.04345065 0.3186381
+Tag.6 -1.6329986 12.81479 3.0078205 0.08286364 0.4557500
+Tag.16 0.9324996 13.57074 1.3477682 0.24566867 0.8276702
+Tag.20 0.8543138 13.76364 1.1890032 0.27553071 0.8276702
+Tag.19 -0.7976602 13.31405 0.9279151 0.33540526 0.8276702
+Tag.12 0.7081170 14.31389 0.9095513 0.34023349 0.8276702
+Tag.3 -0.7300410 13.54155 0.8300307 0.36226364 0.8276702
+Tag.8 -0.7917906 12.86353 0.7830377 0.37621371 0.8276702
+>
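
Note the pattern in these calls: prior.count, which adds a small count
to each observation to damp the log-fold-changes of low-count genes, is
always given as 0.5 divided by the number of libraries (0.5/4 here,
0.5/3 and 0.5/7 below), apparently holding the total prior count fixed
at 0.5 across designs:

    0.5/4  # 0.125, as used for the four-library fits here
    0.5/3  # ~0.1667, matching $prior.count in the DGEGLM printout below
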
+> dglm <- estimateGLMCommonDisp(d,design)
+> dglm$common.dispersion
+[1] 0.2033282
+> dglm <- estimateGLMTagwiseDisp(dglm,design,prior.df=20)
+> summary(dglm$tagwise.dispersion)
+ Min. 1st Qu. Median Mean 3rd Qu. Max.
+ 0.1756 0.1879 0.1998 0.2031 0.2135 0.2578
+> fit <- glmFit(dglm,design,prior.count=0.5/4)
+> lrt <- glmLRT(fit,coef=2)
+> topTags(lrt)
+Coefficient: group2
+ logFC logCPM LR PValue FDR
+Tag.17 2.0450988 13.73727 6.8001118 0.009115216 0.2005348
+Tag.2 4.0861092 11.54122 4.8594088 0.027495756 0.2872068
+Tag.21 -1.7265904 13.38327 4.2537154 0.039164558 0.2872068
+Tag.6 -1.6329904 12.81479 3.1763761 0.074710253 0.4109064
+Tag.16 0.9324970 13.57074 1.4126709 0.234613512 0.8499599
+Tag.20 0.8543183 13.76364 1.2721097 0.259371274 0.8499599
+Tag.19 -0.7976614 13.31405 0.9190392 0.337727381 0.8499599
+Tag.12 0.7081163 14.31389 0.9014515 0.342392806 0.8499599
+Tag.3 -0.7300488 13.54155 0.8817937 0.347710872 0.8499599
+Tag.8 -0.7918166 12.86353 0.7356185 0.391068049 0.8603497
+> dglm <- estimateGLMTrendedDisp(dglm,design)
+> summary(dglm$trended.dispersion)
+ Min. 1st Qu. Median Mean 3rd Qu. Max.
+ 0.1522 0.1676 0.1740 0.1887 0.2000 0.3469
+> dglm <- estimateGLMTrendedDisp(dglm,design,method="power")
+> summary(dglm$trended.dispersion)
+ Min. 1st Qu. Median Mean 3rd Qu. Max.
+ 0.1522 0.1676 0.1740 0.1887 0.2000 0.3469
+> dglm <- estimateGLMTrendedDisp(dglm,design,method="spline")
+> summary(dglm$trended.dispersion)
+ Min. 1st Qu. Median Mean 3rd Qu. Max.
+0.09353 0.11080 0.15460 0.19010 0.23050 0.52010
+> dglm <- estimateGLMTrendedDisp(dglm,design,method="bin.spline")
+> summary(dglm$trended.dispersion)
+ Min. 1st Qu. Median Mean 3rd Qu. Max.
+ 0.1997 0.1997 0.1997 0.1997 0.1997 0.1997
+> dglm <- estimateGLMTagwiseDisp(dglm,design,prior.df=20)
+> summary(dglm$tagwise.dispersion)
+ Min. 1st Qu. Median Mean 3rd Qu. Max.
+ 0.1385 0.1792 0.1964 0.1935 0.2026 0.2709
+>
+> dglm2 <- estimateDisp(dglm, design)
+> summary(dglm2$tagwise.dispersion)
+ Min. 1st Qu. Median Mean 3rd Qu. Max.
+ 0.1766 0.1789 0.1814 0.1846 0.1870 0.2119
+> dglm2 <- estimateDisp(dglm, design, prior.df=20)
+> summary(dglm2$tagwise.dispersion)
+ Min. 1st Qu. Median Mean 3rd Qu. Max.
+ 0.1527 0.1669 0.1814 0.1858 0.1951 0.2497
+> dglm2 <- estimateDisp(dglm, design, robust=TRUE)
+> summary(dglm2$tagwise.dispersion)
+ Min. 1st Qu. Median Mean 3rd Qu. Max.
+ 0.1766 0.1789 0.1814 0.1846 0.1870 0.2119
+>
+> # Continuous trend
+> nlibs <- 3
+> ntags <- 1000
+> dispersion.true <- 0.1
+> # Make first transcript respond to covariate x
+> x <- 0:2
+> design <- model.matrix(~x)
+> beta.true <- cbind(Beta1=2,Beta2=c(2,rep(0,ntags-1)))
+> mu.true <- 2^(beta.true %*% t(design))
+> # Generate count data
+> y <- rnbinom(ntags*nlibs,mu=mu.true,size=1/dispersion.true)
+> y <- matrix(y,ntags,nlibs)
+> colnames(y) <- c("x0","x1","x2")
+> rownames(y) <- paste("Gene",1:ntags,sep="")
+> d <- DGEList(y)
+> d <- calcNormFactors(d)
+> fit <- glmFit(d, design, dispersion=dispersion.true, prior.count=0.5/3)
+> results <- glmLRT(fit, coef=2)
+> topTags(results)
+Coefficient: x
+ logFC logCPM LR PValue FDR
+Gene1 2.907024 13.56183 38.738512 4.845536e-10 4.845536e-07
+Gene61 2.855317 10.27136 10.738307 1.049403e-03 5.247015e-01
+Gene62 -2.123902 10.53174 8.818703 2.981585e-03 8.334760e-01
+Gene134 -1.949073 10.53355 8.125889 4.363759e-03 8.334760e-01
+Gene740 -1.610046 10.94907 8.013408 4.643227e-03 8.334760e-01
+Gene354 2.022698 10.45066 7.826308 5.149118e-03 8.334760e-01
+Gene5 1.856816 10.45249 7.214238 7.232750e-03 8.334760e-01
+Gene746 -1.798331 10.53094 6.846262 8.882693e-03 8.334760e-01
+Gene110 1.623148 10.68607 6.737984 9.438120e-03 8.334760e-01
+Gene383 1.637140 10.75412 6.687530 9.708965e-03 8.334760e-01
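
By construction only Gene1 responds to the covariate: its true mean is
mu = 2^(2 + 2x), i.e. 4, 16 and 64 across the three libraries (a true
log2 fold change of 2 per unit of x), while every other gene is flat at
2^2 = 4, which is why Gene1 dominates the table above:

    x <- 0:2
    2^(2 + 2*x)  # 4 16 64: expected counts for Gene1
    2^(2 + 0*x)  # 4 4 4: expected counts for every other gene
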
+> d1 <- estimateGLMCommonDisp(d, design, verbose=TRUE)
+Disp = 0.10253 , BCV = 0.3202
+> glmFit(d,design,dispersion=dispersion.true, prior.count=0.5/3)
+An object of class "DGEGLM"
+$coefficients
+ (Intercept) x
+Gene1 -7.391745 2.0149958
+Gene2 -7.318483 -0.7611895
+Gene3 -6.831702 -0.1399478
+Gene4 -7.480255 0.5172002
+Gene5 -8.747793 1.2870467
+995 more rows ...
+
+$fitted.values
+ x0 x1 x2
+Gene1 2.3570471 18.954454 138.2791328
+Gene2 2.5138172 1.089292 0.4282107
+Gene3 4.1580452 3.750528 3.0690081
+Gene4 2.1012460 3.769592 6.1349937
+Gene5 0.5080377 2.136398 8.1502486
+995 more rows ...
+
+$deviance
+[1] 6.38037545 1.46644913 1.38532340 0.01593969 1.03894513
+995 more elements ...
+
+$iter
+[1] 8 4 4 4 6
+995 more elements ...
+
+$failed
+[1] FALSE FALSE FALSE FALSE FALSE
+995 more elements ...
+
+$method
+[1] "levenberg"
+
+$counts
+ x0 x1 x2
+Gene1 0 30 110
+Gene2 2 2 0
+Gene3 3 6 2
+Gene4 2 4 6
+Gene5 1 1 9
+995 more rows ...
+
+$unshrunk.coefficients
+ (Intercept) x
+Gene1 -7.437763 2.0412762
+Gene2 -7.373370 -0.8796273
+Gene3 -6.870127 -0.1465014
+Gene4 -7.552642 0.5410832
+Gene5 -8.972372 1.3929679
+995 more rows ...
+
+$df.residual
+[1] 1 1 1 1 1
+995 more elements ...
+
+$design
+ (Intercept) x
+1 1 0
+2 1 1
+3 1 2
+attr(,"assign")
+[1] 0 1
+
+$offset
+ [,1] [,2] [,3]
+[1,] 8.295172 8.338525 8.284484
+[2,] 8.295172 8.338525 8.284484
+[3,] 8.295172 8.338525 8.284484
+[4,] 8.295172 8.338525 8.284484
+[5,] 8.295172 8.338525 8.284484
+995 more rows ...
+
+$dispersion
+[1] 0.1
+
+$prior.count
+[1] 0.1666667
+
+$samples
+ group lib.size norm.factors
+x0 1 4001 1.0008730
+x1 1 4176 1.0014172
+x2 1 3971 0.9977138
+
+$AveLogCPM
+[1] 13.561832 9.682757 10.447014 10.532113 10.452489
+995 more elements ...
+
+>
+> d2 <- estimateDisp(d, design)
+> summary(d2$tagwise.dispersion)
+ Min. 1st Qu. Median Mean 3rd Qu. Max.
+0.05545 0.09511 0.11620 0.11010 0.13330 0.16860
+> d2 <- estimateDisp(d, design, prior.df=20)
+> summary(d2$tagwise.dispersion)
+ Min. 1st Qu. Median Mean 3rd Qu. Max.
+0.04203 0.08586 0.11280 0.11010 0.12370 0.37410
+> d2 <- estimateDisp(d, design, robust=TRUE)
+> summary(d2$tagwise.dispersion)
+ Min. 1st Qu. Median Mean 3rd Qu. Max.
+0.05545 0.09511 0.11620 0.11010 0.13330 0.16860
+>
+> # Exact tests
+> y <- matrix(rnbinom(20,mu=10,size=3/2),5,4)
+> group <- factor(c(1,1,2,2))
+> ys <- splitIntoGroupsPseudo(y,group,pair=c(1,2))
+> exactTestDoubleTail(ys$y1,ys$y2,dispersion=2/3)
+[1] 0.1334396 0.6343568 0.7280015 0.7124912 0.3919258
+>
+> y <- matrix(rnbinom(5*7,mu=10,size=3/2),5,7)
+> group <- factor(c(1,1,2,2,3,3,3))
+> ys <- splitIntoGroupsPseudo(y,group,pair=c(1,3))
+> exactTestDoubleTail(ys$y1,ys$y2,dispersion=2/3)
+[1] 1.0000000 0.4486382 1.0000000 0.9390317 0.4591241
+> exactTestBetaApprox(ys$y1,ys$y2,dispersion=2/3)
+[1] 1.0000000 0.4492969 1.0000000 0.9421695 0.4589194
+>
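
In these calls the dispersion is the reciprocal of the size parameter
used to simulate the counts: rnbinom parameterizes the variance as
mu + mu^2/size, while edgeR writes it as mu + dispersion*mu^2, so
size=3/2 corresponds exactly to dispersion=2/3:

    size <- 3/2
    1/size  # 0.6666667, the dispersion passed to exactTestDoubleTail above
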
+> y[1,3:4] <- 0
+> design <- model.matrix(~group)
+> fit <- glmFit(y,design,dispersion=2/3,prior.count=0.5/7)
+> summary(fit$coef)
+ (Intercept) group2 group3
+ Min. :-1.817 Min. :-5.0171 Min. :-0.64646
+ 1st Qu.:-1.812 1st Qu.:-1.1565 1st Qu.:-0.13919
+ Median :-1.712 Median : 0.1994 Median :-0.10441
+ Mean :-1.625 Mean :-0.9523 Mean :-0.04217
+ 3rd Qu.:-1.429 3rd Qu.: 0.3755 3rd Qu.:-0.04305
+ Max. :-1.356 Max. : 0.8374 Max. : 0.72227
+>
+> lrt <- glmLRT(fit,contrast=cbind(c(0,1,0),c(0,0,1)))
+> topTags(lrt)
+Coefficient: LR test of 2 contrasts
+ logFC.1 logFC.2 logCPM LR PValue FDR
+1 -7.2381060 -0.0621100 17.19071 10.7712171 0.004582051 0.02291026
+5 -1.6684268 -0.9326507 17.33529 1.7309951 0.420842115 0.90967967
+2 1.2080938 1.0420198 18.24544 1.0496688 0.591653347 0.90967967
+4 0.5416704 -0.1506381 17.57744 0.3958596 0.820427427 0.90967967
+3 0.2876249 -0.2008143 18.06216 0.1893255 0.909679672 0.90967967
+> design <- model.matrix(~0+group)
+> fit <- glmFit(y,design,dispersion=2/3,prior.count=0.5/7)
+> lrt <- glmLRT(fit,contrast=cbind(c(-1,1,0),c(0,-1,1),c(-1,0,1)))
+> topTags(lrt)
+Coefficient: LR test of 2 contrasts
+ logFC.1 logFC.2 logCPM LR PValue FDR
+1 -7.2381060 7.1759960 17.19071 10.7712171 0.004582051 0.02291026
+5 -1.6684268 0.7357761 17.33529 1.7309951 0.420842115 0.90967967
+2 1.2080938 -0.1660740 18.24544 1.0496688 0.591653347 0.90967967
+4 0.5416704 -0.6923084 17.57744 0.3958596 0.820427427 0.90967967
+3 0.2876249 -0.4884392 18.06216 0.1893255 0.909679672 0.90967967
+>
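
Both parameterizations test the same null hypothesis (all three groups
equal), which is why the LR statistics, p-values and FDRs agree exactly
between the two tables. Note also that the third contrast is the sum of
the first two, so the three-column contrast matrix has rank 2 and the
header still reports a test of 2 contrasts:

    contr <- cbind(c(-1,1,0), c(0,-1,1), c(-1,0,1))
    qr(contr)$rank  # 2: one contrast is linearly dependent
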
+> # simple Good-Turing algorithm runs.
+> test1<-1:9
+> freq1<-c(2018046, 449721, 188933, 105668, 68379, 48190, 35709, 37710, 22280)
+> goodTuring(rep(test1, freq1))
+$P0
+[1] 0.3814719
+
+$proportion
+[1] 8.035111e-08 2.272143e-07 4.060582e-07 5.773690e-07 7.516705e-07
+[6] 9.276808e-07 1.104759e-06 1.282549e-06 1.460837e-06
+
+$count
+[1] 1 2 3 4 5 6 7 8 9
+
+$n
+[1] 2018046 449721 188933 105668 68379 48190 35709 37710 22280
+
+$n0
+[1] 0
+
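The $P0 value above matches the classical Good-Turing estimate of the
unseen probability mass, n1/N, where n1 is the number of species seen
exactly once and N is the total number of observations:

    n <- c(2018046, 449721, 188933, 105668, 68379, 48190, 35709, 37710, 22280)
    n[1] / sum((1:9) * n)  # ~0.3814719, matching $P0
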
+> test2<-c(312, 14491, 16401, 65124, 129797, 323321, 366051, 368599, 405261, 604962)
+> goodTuring(test2)
+$P0
+[1] 0
+
+$proportion
+ [1] 0.0001362656 0.0063162959 0.0071487846 0.0283850925 0.0565733349
+ [6] 0.1409223124 0.1595465235 0.1606570896 0.1766365144 0.2636777866
+
+$count
+ [1] 312 14491 16401 65124 129797 323321 366051 368599 405261 604962
+
+$n
+ [1] 1 1 1 1 1 1 1 1 1 1
+
+$n0
+[1] 0
+
+>
+>
+>
+> proc.time()
+ user system elapsed
+ 3.58 0.03 3.61