[med-svn] [r-bioc-edger] 02/04: Imported Upstream version 3.10.3+dfsg
Andreas Tille
tille at debian.org
Sat Sep 26 16:38:48 UTC 2015
This is an automated email from the git hooks/post-receive script.
tille pushed a commit to branch master
in repository r-bioc-edger.
commit 76af3f2b01f8ae5e0508b71e7abc297e50e5f5ff
Author: Andreas Tille <tille at debian.org>
Date: Sat Sep 26 18:34:45 2015 +0200
Imported Upstream version 3.10.3+dfsg
---
DESCRIPTION | 6 +-
NAMESPACE | 43 +++++----
R/S3methods.R | 23 ++++-
R/adjustedProfileLik.R | 6 +-
R/calcNormFactors.R | 40 ++++----
R/camera.DGEList.R | 39 --------
R/cpm.R | 4 +-
R/diffSpliceDGE.R | 64 +++++++++----
R/estimateDisp.R | 1 +
R/estimateTrendedDisp.R | 23 ++---
R/expandAsMatrix.R | 7 +-
R/geneset-DGEList.R | 80 ++++++++++++++++
R/glmQLFTest.R | 11 +--
R/glmTreat.R | 135 ++++++++++++++++++---------
R/glmfit.R | 34 ++-----
R/goana.R | 53 ++++++-----
R/kegga.R | 71 ++++++++++++++
R/mglmLevenberg.R | 4 +-
R/mglmOneGroup.R | 5 +-
R/mglmOneWay.R | 3 +-
R/plotMD.R | 48 ++++++++++
R/readDGE.R | 40 +++++---
R/roast.DGEList.R | 82 ----------------
R/romer.DGEList.R | 39 --------
R/rpkm.R | 4 +-
R/subsetting.R | 10 +-
build/vignette.rds | Bin 228 -> 229 bytes
inst/doc/edgeR.pdf | Bin 48664 -> 48664 bytes
man/camera.DGEList.Rd | 2 +-
man/cpm.Rd | 2 +-
man/diffSpliceDGE.Rd | 3 +-
man/dimnames.Rd | 4 +-
man/estimateCommonDisp.Rd | 40 ++++----
man/expandAsMatrix.Rd | 3 +-
man/glmQLFTest.Rd | 4 +-
man/glmTreat.Rd | 14 ++-
man/glmfit.Rd | 9 +-
man/goana.Rd | 26 +++---
man/plotMD.Rd | 77 +++++++++++++++
man/readDGE.Rd | 36 +++++---
man/roast.DGEList.Rd | 2 +-
man/romer.DGEList.Rd | 2 +-
src/R_levenberg.cpp | 14 +--
src/R_one_group.cpp | 11 ++-
src/glm.h | 6 +-
src/glm_levenberg.cpp | 10 +-
src/glm_one_group.cpp | 4 +-
src/matvec_check.cpp | 22 +----
src/matvec_check.h | 2 +-
tests/edgeR-Tests.Rout.save | 221 ++++++++++++++++++++++++++++++++++----------
vignettes/edgeR.Rnw | 48 ++++++++++
51 files changed, 923 insertions(+), 514 deletions(-)
diff --git a/DESCRIPTION b/DESCRIPTION
index e700f3b..e88211e 100755
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,6 +1,6 @@
Package: edgeR
-Version: 3.10.2
-Date: 2015/05/24
+Version: 3.10.3
+Date: 2015/09/24
Title: Empirical analysis of digital gene expression data in R
Description: Differential expression analysis of RNA-seq expression profiles with biological replication. Implements a range of statistical methodology based on the negative binomial distributions, including empirical Bayes estimation, exact tests, generalized linear models and quasi-likelihood tests. As well as RNA-seq, it can be applied to differential signal analysis of other types of genomic data that produce counts, including ChIP-seq, SAGE and CAGE.
Author: Yunshun Chen <yuchen at wehi.edu.au>, Aaron Lun <alun at wehi.edu.au>, Davis McCarthy <dmccarthy at wehi.edu.au>, Xiaobei Zhou <xiaobei.zhou at uzh.ch>, Mark Robinson <mark.robinson at imls.uzh.ch>, Gordon Smyth <smyth at wehi.edu.au>
@@ -16,4 +16,4 @@ biocViews: GeneExpression, Transcription, AlternativeSplicing,
TimeCourse, SAGE, Sequencing, ChIPSeq, RNASeq, BatchEffect,
MultipleComparison, Normalization, QualityControl
NeedsCompilation: yes
-Packaged: 2015-05-26 01:55:23 UTC; biocbuild
+Packaged: 2015-09-25 01:53:01 UTC; biocbuild
diff --git a/NAMESPACE b/NAMESPACE
index 1379f7c..a9cce9e 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -8,13 +8,18 @@ exportClasses("DGEList","DGEExact","DGEGLM","DGELRT","TopTags")
exportMethods("show")
import(methods)
-importFrom("limma",camera,contrastAsCoef,decideTests,goana,is.fullrank,lmFit,loessFit,mroast,nonEstimable,plotMDS,removeExt,roast,romer,squeezeVar,subsetListOfArrays,tricubeMovingAverage,zscoreGamma)
+importFrom("limma",camera,contrastAsCoef,decideTests,goana,kegga,is.fullrank,lmFit,loessFit,mroast,nonEstimable,plotMD,plotMDS,plotWithHighlights,removeExt,roast,romer,squeezeVar,subsetListOfArrays,zscoreGamma)
importClassesFrom("limma","LargeDataObject","Roast","MDS")
S3method(as.data.frame,TopTags)
S3method(as.matrix,DGEList)
+S3method(aveLogCPM,default)
+S3method(aveLogCPM,DGEList)
+S3method(aveLogCPM,DGEGLM)
S3method(calcNormFactors,default)
S3method(calcNormFactors,DGEList)
+S3method(camera,DGEList)
+S3method(cpm,DGEList)
S3method(dim,DGEList)
S3method(dim,DGEExact)
S3method(dim,DGEGLM)
@@ -27,20 +32,7 @@ S3method(dimnames,DGELRT)
S3method(dimnames,TopTags)
S3method("dimnames<-",DGEList)
S3method("dimnames<-",DGEGLM)
-S3method(length,DGEList)
-S3method(length,DGEExact)
-S3method(length,TopTags)
-S3method(length,DGEGLM)
-S3method(length,DGELRT)
-S3method(plotMDS,DGEList)
-S3method(camera,DGEList)
-S3method(roast,DGEList)
-S3method(mroast,DGEList)
-S3method(romer,DGEList)
-S3method(aveLogCPM,default)
-S3method(aveLogCPM,DGEList)
-S3method(aveLogCPM,DGEGLM)
-S3method(cpm,DGEList)
+S3method("dimnames<-",DGELRT)
S3method(estimateGLMCommonDisp,default)
S3method(estimateGLMCommonDisp,DGEList)
S3method(estimateGLMTrendedDisp,default)
@@ -49,13 +41,26 @@ S3method(estimateGLMTagwiseDisp,default)
S3method(estimateGLMTagwiseDisp,DGEList)
S3method(glmFit,default)
S3method(glmFit,DGEList)
+S3method(glmQLFit,default)
+S3method(glmQLFit,DGEList)
+S3method(goana,DGEExact)
+S3method(goana,DGELRT)
+S3method(kegga,DGEExact)
+S3method(kegga,DGELRT)
+S3method(length,DGEList)
+S3method(length,DGEExact)
+S3method(length,TopTags)
+S3method(length,DGEGLM)
+S3method(length,DGELRT)
+S3method(mroast,DGEList)
+S3method(plotMD,DGEGLM)
+S3method(plotMD,DGELRT)
+S3method(plotMDS,DGEList)
S3method(predFC,default)
S3method(predFC,DGEList)
+S3method(roast,DGEList)
+S3method(romer,DGEList)
S3method(rpkm,default)
S3method(rpkm,DGEList)
S3method(sumTechReps,default)
S3method(sumTechReps,DGEList)
-S3method(goana,DGEExact)
-S3method(goana,DGELRT)
-S3method(glmQLFit,default)
-S3method(glmQLFit,DGEList)
\ No newline at end of file
diff --git a/R/S3methods.R b/R/S3methods.R
index 45df26e..116cd0c 100644
--- a/R/S3methods.R
+++ b/R/S3methods.R
@@ -39,10 +39,31 @@ assign("dimnames<-.DGEList",function(x,value)
x
})
+assign("dimnames<-.DGEExact",function(x,value)
+{
+ dimnames(x$table) <- value
+ if(!is.null(x$genes)) row.names(x$genes) <- value[[1]]
+ x
+})
+
assign("dimnames<-.DGEGLM",function(x,value)
{
dimnames(x$coefficients) <- value
- if(!is.null(x$samples)) row.names(x$samples) <- value[[2]]
+ if(!is.null(x$unshrunk.coefficients)) dimnames(x$unshrunk.coefficients) <- value
+ if(!is.null(x$fitted.values)) rownames(x$fitted.values) <- value[[1]]
+ if(!is.null(x$counts)) rownames(x$counts) <- value[[1]]
+ if(!is.null(x$genes)) row.names(x$genes) <- value[[1]]
+ x
+})
+
+assign("dimnames<-.DGELRT",function(x,value)
+# 4 June 2015
+{
+ dimnames(x$table) <- value
+ if(!is.null(x$coefficients)) rownames(x$coefficients) <- value[[1]]
+ if(!is.null(x$unshrunk.coefficients)) rownames(x$unshrunk.coefficients) <- value[[1]]
+ if(!is.null(x$fitted.values)) rownames(x$fitted.values) <- value[[1]]
+ if(!is.null(x$counts)) rownames(x$counts) <- value[[1]]
if(!is.null(x$genes)) row.names(x$genes) <- value[[1]]
x
})
diff --git a/R/adjustedProfileLik.R b/R/adjustedProfileLik.R
index 099e032..bc7266b 100644
--- a/R/adjustedProfileLik.R
+++ b/R/adjustedProfileLik.R
@@ -4,12 +4,12 @@ adjustedProfileLik <- function(dispersion, y, design, offset, weights=NULL, adju
# y is matrix: rows are genes/tags/transcripts, columns are samples/libraries
# offset is matrix of the same dimensions as y
# Yunshun Chen, Gordon Smyth, Aaron Lun
-# Created June 2010. Last modified 11 Sep 2014.
+# Created June 2010. Last modified 27 July 2015.
{
if(any(dim(y)!=dim(offset))) offset <- expandAsMatrix(offset,dim(y))
ntags <- nrow(y)
nlibs <- ncol(y)
- if(length(dispersion)==1) dispersion <- rep(dispersion,ntags)
+ dispersion <- expandAsMatrix(dispersion,dim(y),byrow=FALSE)
# Fit tagwise linear models. This is actually the most time-consuming
# operation that I can see for this function.
@@ -18,7 +18,7 @@ adjustedProfileLik <- function(dispersion, y, design, offset, weights=NULL, adju
# Compute log-likelihood.
mu <- fit$fitted
if(is.null(weights)) weights <- 1
- if(dispersion[1] == 0){
+ if(all(dispersion == 0)){
# loglik <- rowSums(weights*dpois(y,lambda=mu,log = TRUE))
ll <- y*log(mu)-mu-lgamma(y+1)
ll[mu==0] <- 0
diff --git a/R/calcNormFactors.R b/R/calcNormFactors.R
index 10de754..c1145c6 100644
--- a/R/calcNormFactors.R
+++ b/R/calcNormFactors.R
@@ -2,40 +2,32 @@ calcNormFactors <- function(object, ...)
UseMethod("calcNormFactors")
calcNormFactors.DGEList <- function(object, method=c("TMM","RLE","upperquartile","none"), refColumn=NULL, logratioTrim=.3, sumTrim=0.05, doWeighting=TRUE, Acutoff=-1e10, p=0.75, ...)
-# Scale normalization of RNA-Seq data.
-# Mark Robinson. Edits by Gordon Smyth.
-# Created October 22 October 2009. Last modified 2 Oct 2014.
+# Scale normalization of RNA-Seq data, for DGEList objects
+# Created 2 October 2014. Last modified 27 August 2015.
{
-# Check object
- x <- as.matrix(object$counts)
- lib.size <- object$samples$lib.size
- if(any(is.na(x))) stop("NAs not permitted")
-
- f <- NextMethod(object=x, lib.size=lib.size, method=method, refColumn=refColumn, logratioTrim=logratioTrim, sumTrim=sumTrim, doWeighting=doWeighting, Acutoff=Acutoff, p=p)
-
-# Output
- object$samples$norm.factors <- f
- return(object)
-
+ object$samples$norm.factors <- calcNormFactors(object=object$counts, lib.size=object$samples$lib.size, method=method, refColumn=refColumn, logratioTrim=logratioTrim, sumTrim=sumTrim, doWeighting=doWeighting, Acutoff=Acutoff, p=p)
+ object
}
calcNormFactors.default <- function(object, lib.size=NULL, method=c("TMM","RLE","upperquartile","none"), refColumn=NULL, logratioTrim=.3, sumTrim=0.05, doWeighting=TRUE, Acutoff=-1e10, p=0.75, ...)
-# Scale normalization of RNA-Seq data.
+# Scale normalization of RNA-Seq data, for count matrices
# Mark Robinson. Edits by Gordon Smyth.
-# Created October 22 October 2009. Last modified 2 Oct 2014.
+# Created 22 October 2009 by Mark Robinson.
+# Last modified 31 July 2015.
{
# Check object
x <- as.matrix(object)
- if(any(is.na(x))) stop("NAs not permitted")
+ if(any(is.na(x))) stop("NA counts not permitted")
# Check lib.size
if(is.null(lib.size)) lib.size <- colSums(x)
+ if(any(is.na(lib.size))) stop("NA lib.sizes not permitted")
# Check method
method <- match.arg(method)
# Remove all zero rows
- allzero <- rowSums(x>0) == 0
+ allzero <- .rowSums(x>0, nrow(x), ncol(x)) == 0
if(any(allzero)) x <- x[!allzero,,drop=FALSE]
# Degenerate cases
@@ -102,7 +94,8 @@ calcNormFactors.default <- function(object, lib.size=NULL, method=c("TMM","RLE",
v <- v[fin]
# taken from the original mean() function
- n <- sum(fin)
+ n <- length(logR)
+
loL <- floor(n * logratioTrim) + 1
hiL <- n + 1 - loL
loS <- floor(n * sumTrim) + 1
@@ -114,8 +107,13 @@ calcNormFactors.default <- function(object, lib.size=NULL, method=c("TMM","RLE",
keep <- (rank(logR)>=loL & rank(logR)<=hiL) & (rank(absE)>=loS & rank(absE)<=hiS)
if(doWeighting)
- 2^( sum(logR[keep]/v[keep], na.rm=TRUE) / sum(1/v[keep], na.rm=TRUE) )
+ f <- sum(logR[keep]/v[keep], na.rm=TRUE) / sum(1/v[keep], na.rm=TRUE)
else
- 2^( mean(logR[keep], na.rm=TRUE) )
+ f <- mean(logR[keep], na.rm=TRUE)
+
+# Results will be missing if the two libraries share no features with positive counts
+# In this case, return unity
+ if(is.na(f)) f <- 0
+ 2^f
}
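For reference, a minimal sketch of the refactored calcNormFactors() interface, where the DGEList method now delegates to the default method; the counts below are simulated and purely illustrative:

    library(edgeR)
    set.seed(1)
    counts <- matrix(rpois(4000, lambda=10), nrow=1000, ncol=4)
    y <- DGEList(counts=counts, group=factor(c(1,1,2,2)))
    y <- calcNormFactors(y, method="TMM")        # fills y$samples$norm.factors
    f <- calcNormFactors(counts, method="TMM")   # default method on a plain matrix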
diff --git a/R/camera.DGEList.R b/R/camera.DGEList.R
deleted file mode 100644
index a6385b3..0000000
--- a/R/camera.DGEList.R
+++ /dev/null
@@ -1,39 +0,0 @@
-camera.DGEList <- function(y, index, design=NULL, contrast=ncol(design), ...)
-# Rotation gene set testing for RNA-Seq data accounting for inter-gene correlation
-# Yunshun Chen, Gordon Smyth
-# Created 07 Jan 2013. Last modified 28 Feb 2014.
-{
-# Check dispersion estimates in y
- dispersion <- getDispersion(y)
- if(is.null(dispersion)) stop("Dispersion estimate not found. Please estimate the dispersion(s) before you proceed.")
-
-# Make default design matrix from group factor
- if(is.null(design)) {
- if(nlevels(y$samples$group)<2) stop("design not supplied and samples all belong to the same group")
- design <- model.matrix(~y$samples$group)
- rownames(design) <- colnames(y)
- }
- nbeta <- ncol(design)
- if(nbeta < 2) stop("design matrix must have at least two columns")
-
-# Check contrast
- if(length(contrast) == 1) {
- u <- rep.int(0, nbeta)
- u[contrast] <- 1
- contrast <- u
- }
- if(length(contrast) != nbeta) stop("length of contrast must match column dimension of design")
- if(all(contrast==0)) stop("contrast all zero")
-
-# Construct null hypothesis design matrix
- QR <- qr(contrast)
- design0 <- t(qr.qty(QR, t(design))[-1, , drop=FALSE])
-
-# Null hypothesis fit
- fit.null <- glmFit(y, design0, prior.count=0)
-
-# Quantile residuals from null fit
- y <- zscoreNBinom(y$counts, mu=fit.null$fitted.values, size=1/dispersion)
-
- NextMethod("camera")
-}
diff --git a/R/cpm.R b/R/cpm.R
index 8aa28c7..544e2b4 100644
--- a/R/cpm.R
+++ b/R/cpm.R
@@ -4,7 +4,7 @@ UseMethod("cpm")
cpm.DGEList <- function(x, normalized.lib.sizes=TRUE, log=FALSE, prior.count=0.25, ...)
# Counts per million for a DGEList
# Davis McCarthy and Gordon Smyth.
-# Created 20 June 2011. Last modified 1 November 2012
+# Created 20 June 2011. Last modified 6 July 2015
{
lib.size <- x$samples$lib.size
if(normalized.lib.sizes) lib.size <- lib.size*x$samples$norm.factors
@@ -14,7 +14,7 @@ cpm.DGEList <- function(x, normalized.lib.sizes=TRUE, log=FALSE, prior.count=0.2
cpm.default <- function(x, lib.size=NULL, log=FALSE, prior.count=0.25, ...)
# Counts per million for a matrix
# Davis McCarthy and Gordon Smyth.
-# Created 20 June 2011. Last modified 1 November 2012
+# Created 20 June 2011. Last modified 6 July 2015
{
x <- as.matrix(x)
if(is.null(lib.size)) lib.size <- colSums(x)
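A short sketch of cpm() on a plain count matrix, with log-CPM and the default prior count (simulated data, illustrative only):

    library(edgeR)
    counts <- matrix(rpois(400, lambda=10), nrow=100, ncol=4)
    logCPM <- cpm(counts, log=TRUE, prior.count=0.25)   # default method, library sizes from colSums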
diff --git a/R/diffSpliceDGE.R b/R/diffSpliceDGE.R
index 4d36e66..a372ffb 100644
--- a/R/diffSpliceDGE.R
+++ b/R/diffSpliceDGE.R
@@ -1,8 +1,8 @@
-diffSpliceDGE <- function(fit.exon, coef=ncol(fit.exon$design), geneid, exonid=NULL, verbose=TRUE)
+diffSpliceDGE <- function(fit.exon, coef=ncol(fit.exon$design), contrast=NULL, geneid, exonid=NULL, verbose=TRUE)
{
# Identify exons and genes with splice variants using negative binomial GLMs
# Yunshun Chen and Gordon Smyth
-# Created 29 March 2014. Last modified 25 Aug 2014.
+# Created 29 March 2014. Last modified 13 May 2015.
# Check input (from diffSplice in limma)
exon.genes <- fit.exon$genes
@@ -35,9 +35,49 @@ diffSpliceDGE <- function(fit.exon, coef=ncol(fit.exon$design), geneid, exonid=N
o <- order(geneid,exonid)
geneid <- geneid[o]
exon.genes <- exon.genes[o,,drop=FALSE]
-
fit.exon <- fit.exon[o, ]
+# Check design matrix
+ design <- as.matrix(fit.exon$design)
+ nbeta <- ncol(design)
+ if(nbeta < 2) stop("Need at least two columns for design, usually the first is the intercept column")
+ coef.names <- colnames(design)
+
+ if(fit.exon$prior.count!=0){
+ coefficients.mle <- fit.exon$unshrunk.coefficients
+ } else {
+ coefficients.mle <- fit.exon$coefficients
+ }
+
+# Evaluate beta to be tested
+# Note that contrast takes precedence over coef: if contrast is given
+# then reform design matrix so that contrast of interest is the first column
+ if(is.null(contrast)) {
+ if(length(coef) > 1) coef <- unique(coef)
+ if(is.character(coef)) {
+ check.coef <- coef %in% colnames(design)
+ if(any(!check.coef)) stop("One or more named coef arguments do not match a column of the design matrix.")
+ coef.name <- coef
+ coef <- match(coef, colnames(design))
+ }
+ else
+ coef.name <- coef.names[coef]
+ beta <- coefficients.mle[, coef, drop=FALSE]
+ } else {
+ contrast <- as.matrix(contrast)
+ reform <- contrastAsCoef(design, contrast=contrast, first=TRUE)
+ coef <- 1
+ beta <- drop(coefficients.mle %*% contrast)
+ contrast <- drop(contrast)
+ i <- contrast!=0
+ coef.name <- paste(paste(contrast[i],coef.names[i],sep="*"),collapse=" ")
+ design <- reform$design
+ }
+ beta <- as.vector(beta)
+
+# Null design matrix
+ design0 <- design[, -coef, drop=FALSE]
+
# Gene level information
gene.counts <- rowsum(fit.exon$counts, geneid, reorder=FALSE)
gene.dge <- DGEList(counts=gene.counts, genes=unique(geneid))
@@ -72,6 +112,7 @@ diffSpliceDGE <- function(fit.exon, coef=ncol(fit.exon$design), geneid, exonid=N
geneid <- geneid[exon.keep]
exon.genes <- exon.genes[exon.keep, , drop=FALSE]
fit.exon <- fit.exon[exon.keep, ]
+ beta <- beta[exon.keep]
fit.gene <- fit.gene[gene.keep, ]
gene.nexons <- gene.nexons[gene.keep]
@@ -85,7 +126,7 @@ diffSpliceDGE <- function(fit.exon, coef=ncol(fit.exon$design), geneid, exonid=N
gene.fit.exon <- glmFit(gene.counts.exon, design=design, dispersion=gene.dispersion.exon, lib.size=gene.dge$samples$lib.size)
gene.betabar <- gene.fit.exon$coefficients[, coef, drop=FALSE]
offset.new <- fit.exon$offset + gene.betabar %*% t(design[, coef, drop=FALSE])
- coefficients <- fit.exon$coefficients[, coef, drop=FALSE] - gene.betabar
+ coefficients <- beta - gene.betabar
# Testing
design0 <- design[, -coef, drop=FALSE]
@@ -101,15 +142,13 @@ diffSpliceDGE <- function(fit.exon, coef=ncol(fit.exon$design), geneid, exonid=N
gene.df.total <- pmin(gene.df.total, ngenes*max.df.residual)
exon.p.value <- pf(exon.F, df1=exon.df.test, df2=gene.df.total[g], lower.tail=FALSE, log.p=FALSE)
- #Ensure is not more significant than chisquare test
+# Ensure it is not more significant than the chisquare test
i <- s2.fit$var.post[gene.keep][g] < 1
if(any(i)) {
chisq.pvalue <- pchisq(exon.LR[i], df=exon.df.test[i], lower.tail=FALSE, log.p=FALSE)
exon.p.value[i] <- pmax(exon.p.value[i], chisq.pvalue)
}
-# Gene p-values
-
# Gene Simes' p-values
o <- order(g, exon.p.value, decreasing=FALSE)
p <- exon.p.value[o]
@@ -240,14 +279,3 @@ plotSpliceDGE <- function(lrt, geneid=NULL, rank=1L, FDR = 0.05)
}
abline(h=0,lty=2)
}
-
-
-
-
-
-
-
-
-
-
-
diff --git a/R/estimateDisp.R b/R/estimateDisp.R
index d98c87e..e418e8b 100644
--- a/R/estimateDisp.R
+++ b/R/estimateDisp.R
@@ -29,6 +29,7 @@ estimateDisp <- function(y, design=NULL, prior.df=NULL, trend.method="locfit", s
# Classic edgeR
if(is.null(design)){
# One group
+ cat("Design matrix not provided. Switch to the classic mode.\n")
group <- y$samples$group <- as.factor(y$samples$group)
if(length(levels(group))==1)
design <- matrix(1,nlibs,1)
diff --git a/R/estimateTrendedDisp.R b/R/estimateTrendedDisp.R
index 6d08bbe..47c0b36 100644
--- a/R/estimateTrendedDisp.R
+++ b/R/estimateTrendedDisp.R
@@ -1,6 +1,6 @@
estimateTrendedDisp <- function(object, method="bin.spline", df=5, span=2/3)
# Yunshun Chen, Gordon Smyth.
-# Created 2 Feb 2012, last modified on 4 Oct 2012.
+# Created 2 Feb 2012, last modified on 24 Sep 2015.
{
if( !is(object,"DGEList") ) stop("object must be a DGEList")
if( is.null(object$pseudo.counts) ) {
@@ -19,7 +19,7 @@ estimateTrendedDisp <- function(object, method="bin.spline", df=5, span=2/3)
for(i in 1:nbins) {
tagsinbin <- bins$group==i
- disp.bins[i] <- estimateCommonDisp(object[tagsinbin,])$common.dispersion
+ disp.bins[i] <- estimateCommonDisp(object[tagsinbin,],rowsum.filter=0)$common.dispersion
logCPM.bins[i] <- mean(logCPM[tagsinbin])
}
@@ -30,19 +30,16 @@ estimateTrendedDisp <- function(object, method="bin.spline", df=5, span=2/3)
r <- range(logCPM.bins)
knots2 <- r[1]+p1*(r[2]-r[1])
knots <- 0.3*knots1+0.7*knots2
- ind <- rep(NA, df+1)
- ind[1] <- which.min(logCPM.bins)
- ind[df+1] <- which.max(logCPM.bins)
- for(i in 2:df)
- ind[i] <- which.min(abs(knots[i-1]-logCPM.bins))
- fit <- lm(disp.bins ~ splines::ns(logCPM.bins, df=df, knots=knots))
- f <- splinefun(logCPM.bins[ind], fit$fitted.value[ind], method="natural")
- dispersion <- f(logCPM)
+ basisbins <- splines::ns(logCPM.bins,df=df,knots=knots,intercept=TRUE)
+ beta <- coefficients(lm.fit(basisbins, sqrt(disp.bins)))
+ basisall <- predict(basisbins,newx=logCPM)
+ dispersion <- drop(basisall %*% beta)^2
}
+
if( method=="bin.loess" ) {
- fit <- loessFit(disp.bins, logCPM.bins, span=span)
- f <- approxfun(logCPM.bins, fit$fitted, method="linear", rule=2)
- dispersion <- f(logCPM)
+ fit <- loessFit(sqrt(disp.bins), logCPM.bins, span=span, iterations=1)
+ f <- approxfun(logCPM.bins, fit$fitted, rule=2)
+ dispersion <- f(logCPM)^2
}
object$trended.dispersion <- dispersion
diff --git a/R/expandAsMatrix.R b/R/expandAsMatrix.R
index 0c76ddb..c71064b 100644
--- a/R/expandAsMatrix.R
+++ b/R/expandAsMatrix.R
@@ -1,20 +1,21 @@
-expandAsMatrix <- function(x,dim=NULL)
+expandAsMatrix <- function(x,dim=NULL,byrow=TRUE)
# Convert scalar, row or column vector, or matrix, to be a matrix
# Gordon Smyth
-# 26 Jan 2011. Last modified 26 Jan 2011.
+# 26 Jan 2011. Last modified 27 July 2015 (Yunshun Chen).
{
# Check dim argument
if(is.null(dim)) return(as.matrix(x))
if(length(dim)<2) stop("dim must be numeric vector of length 2")
dim <- round(dim[1:2])
- if(any(dim<1)) stop("zero or negative dimensions not allowed")
+ if(any(dim<0)) stop("negative dimensions not allowed")
# x is a vector
dx <- dim(x)
if(is.null(dx)) {
lx <- length(x)
if(lx==1) return(matrix(x,dim[1],dim[2]))
+ if(lx==dim[1] & lx==dim[2]) return(matrix(x,dim[1],dim[2],byrow=byrow))
if(lx==dim[2]) return(matrix(x,dim[1],dim[2],byrow=TRUE))
if(lx==dim[1]) return(matrix(x,dim[1],dim[2],byrow=FALSE))
stop("x of unexpected length")
diff --git a/R/geneset-DGEList.R b/R/geneset-DGEList.R
new file mode 100644
index 0000000..5335589
--- /dev/null
+++ b/R/geneset-DGEList.R
@@ -0,0 +1,80 @@
+roast.DGEList <- function(y, index=NULL, design=NULL, contrast=ncol(design), ...)
+# Rotation gene set testing for RNA-Seq data
+# Yunshun Chen, Gordon Smyth
+# Created 19 Dec 2012. Last revised on 27 May 2015
+{
+ y <- .zscoreDGE(y=y, design=design, contrast=contrast)
+ roast(y=y, index=index, design=design, contrast=contrast, var.prior=1, df.prior=Inf, ...)
+}
+
+mroast.DGEList <- function(y, index=NULL, design=NULL, contrast=ncol(design), ...)
+# Rotation gene set testing for RNA-Seq data with multiple sets
+# Yunshun Chen, Gordon Smyth
+# Created 8 Jan 2013. Last revised 27 May 2015.
+{
+ y <- .zscoreDGE(y=y, design=design, contrast=contrast)
+ mroast(y=y, index=index, design=design, contrast=contrast, var.prior=1, df.prior=Inf, ...)
+}
+
+camera.DGEList <- function(y, index, design=NULL, contrast=ncol(design), ...)
+# Rotation gene set testing for RNA-Seq data accounting for inter-gene correlation
+# Yunshun Chen, Gordon Smyth
+# Created 07 Jan 2013. Last modified 27 May 2015.
+{
+ y <- .zscoreDGE(y=y, design=design, contrast=contrast)
+ camera(y=y, index=index, design=design, contrast=contrast, ...)
+}
+
+romer.DGEList <- function(y, index, design=NULL, contrast=ncol(design), ...)
+# rotation mean-rank version of GSEA (gene set enrichment analysis) for RNA-Seq data
+# Yunshun Chen, Gordon Smyth
+# Created 20 Oct 2014. Last modified 27 May 2015.
+{
+ y <- .zscoreDGE(y=y, design=design, contrast=contrast)
+ romer(y=y, index=index, design=design, contrast, ...)
+}
+
+.zscoreDGE <- function(y, design=NULL, contrast=ncol(design))
+# Calculate NB z-scores given a DGEList object.
+# Yunshun Chen, Gordon Smyth
+# Created 27 May 2015. Last modified 19 June 2015.
+{
+# Check for all zero counts
+ allzero <- rowSums(y$counts>1e-8)==0
+ if(any(allzero)) warning(sum(allzero)," rows with all zero counts")
+
+# Check dispersion estimates in y
+ dispersion <- getDispersion(y)
+ if(is.null(dispersion)) stop("Dispersion estimate not found. Please estimate the dispersion(s) before you proceed.")
+
+# Make default design matrix from group factor
+ if(is.null(design)) {
+ if(nlevels(y$samples$group)<2) stop("design not supplied and samples all belong to the same group")
+ design <- model.matrix(~y$samples$group)
+ rownames(design) <- colnames(y)
+ }
+ nbeta <- ncol(design)
+ if(nbeta < 2) stop("design matrix must have at least two columns")
+
+# contrast could be a coef name
+ if(is.character(contrast)) {
+ if(length(contrast)>1) stop("contrast should specify only one column of design")
+ contrast <- which(contrast==colnames(design))
+ if(!length(contrast)) stop("contrast doesn't match any column of design")
+ }
+
+# Construct null hypothesis design matrix
+ if(length(contrast) == 1) {
+ design0 <- design[,-contrast,drop=FALSE]
+ } else {
+ design <- contrastAsCoef(design,contrast=contrast,first=FALSE)$design
+ design0 <- design[,-nbeta,drop=FALSE]
+ }
+
+# Null hypothesis fit
+ fit.null <- glmFit(y, design0, prior.count=0)
+
+# Quantile residuals from null fit
+ y <- zscoreNBinom(y$counts, mu=fit.null$fitted.values, size=1/dispersion)
+ y
+}
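A minimal sketch of the gene set tests that now share the .zscoreDGE() helper, using simulated counts and a two-group design; the objects y, design and iset below are made up for illustration:

    library(edgeR)
    set.seed(1)
    counts <- matrix(rnbinom(2000, mu=10, size=10), nrow=500, ncol=4)
    group <- factor(c(1,1,2,2))
    design <- model.matrix(~group)
    y <- DGEList(counts, group=group)
    y <- estimateDisp(y, design)
    iset <- 1:20                                    # arbitrary gene set indices
    roast(y, index=iset, design=design, contrast=2)
    camera(y, index=iset, design=design, contrast=2)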
diff --git a/R/glmQLFTest.R b/R/glmQLFTest.R
index 4a8cabc..47aa94f 100644
--- a/R/glmQLFTest.R
+++ b/R/glmQLFTest.R
@@ -4,8 +4,7 @@ glmQLFit <- function(y, ...)
UseMethod("glmQLFit")
glmQLFit.DGEList <- function(y, design=NULL, dispersion=NULL, offset=NULL, abundance.trend=TRUE, robust=FALSE, winsor.tail.p=c(0.05, 0.1), ...)
-# Quasi-likelihood F-tests for DGE glms.
-# Yunshun Chen and Aaron Lun
+# Written by Yunshun Chen and Aaron Lun
# Created 05 November 2014.
{
if(is.null(dispersion)) {
@@ -25,9 +24,9 @@ glmQLFit.DGEList <- function(y, design=NULL, dispersion=NULL, offset=NULL, abund
new("DGEGLM",fit)
}
-glmQLFit.default <- function(y, design=NULL, dispersion=0.05, offset=NULL, lib.size=NULL, abundance.trend=TRUE, AveLogCPM=NULL, robust=FALSE, winsor.tail.p=c(0.05, 0.1), ...)
+glmQLFit.default <- function(y, design=NULL, dispersion=NULL, offset=NULL, lib.size=NULL, abundance.trend=TRUE, AveLogCPM=NULL, robust=FALSE, winsor.tail.p=c(0.05, 0.1), ...)
# Fits a GLM and computes quasi-likelihood dispersions for each gene.
-# Yunshun Chen and Aaron Lun, based on code by Davis McCarthy and Gordon Smyth
+# Written by Yunshun Chen and Aaron Lun, based on code by Davis McCarthy and Gordon Smyth
# Created 15 September 2014. Last modified 05 November 2014.
{
# Check y
@@ -36,7 +35,7 @@ glmQLFit.default <- function(y, design=NULL, dispersion=0.05, offset=NULL, lib.s
nlib <- ncol(y)
# Check dispersion
- if(is.null(dispersion)) dispersion <- 0.05
+ if(is.null(dispersion)) stop("No dispersion values provided.")
# Check offset and lib.size
if(is.null(offset)) {
@@ -109,7 +108,7 @@ glmQLFTest <- function(glmfit, coef=ncol(glmfit$design), contrast=NULL)
plotQLDisp <- function(glmfit, xlab="Average Log2 CPM", ylab="Quarter-Root Mean Deviance", pch=16, cex=0.2, col.shrunk="red", col.trend="blue", col.raw="black", ...)
# Plots the result of QL-based shrinkage.
-# Aaron Lun, based on code by Davis McCarthy and Gordon Smyth
+# written by Aaron Lun, based on code by Davis McCarthy and Gordon Smyth
# 15 September 2014
{
A <- glmfit$AveLogCPM
diff --git a/R/glmTreat.R b/R/glmTreat.R
index df8d2f4..fa70ec3 100644
--- a/R/glmTreat.R
+++ b/R/glmTreat.R
@@ -5,11 +5,25 @@ treatDGE <- function(glmfit, coef=ncol(glmfit$design), contrast=NULL, lfc=0)
}
glmTreat <- function(glmfit, coef=ncol(glmfit$design), contrast=NULL, lfc=0)
-# Likelihood ratio test or quasi-likelihood F-test with threshold
+# Likelihood ratio test or quasi-likelihood F-test with a threshold
# Yunshun Chen and Gordon Smyth
-# Created on 05 May 2014. Last modified on 25 Mar 2015
+# Created on 05 May 2014. Last modified on 29 April 2015
{
if(lfc < 0) stop("lfc has to be non-negative")
+
+# Check if glmfit is from glmFit() or glmQLFit()
+ isLRT <- is.null(glmfit$df.prior)
+
+# Switch to glmLRT() or glmQLFTest() if lfc is zero
+ if(lfc==0) {
+ fun <- ifelse(isLRT, "glmLRT", "glmQLFTest")
+ cat( paste0("Zero log2-FC threshold detected. Switch to ", fun, "() instead."), "\n" )
+ return( do.call(fun, args=list(glmfit, coef, contrast)) )
+ }
+
+# Check if there is any log-FC shrinkage
+ shrunk <- glmfit$prior.count!=0
+
# Check glmfit
if(!is(glmfit,"DGEGLM")) {
if(is(glmfit,"DGEList") && is(coef,"DGEGLM")) {
@@ -20,22 +34,12 @@ glmTreat <- function(glmfit, coef=ncol(glmfit$design), contrast=NULL, lfc=0)
if(is.null(glmfit$AveLogCPM)) glmfit$AveLogCPM <- aveLogCPM(glmfit)
nlibs <- ncol(glmfit)
ngenes <- nrow(glmfit)
-
-# Check if glmfit is from glmFit() or glmQLFit()
- isLRT <- is.null(glmfit$df.prior)
- if(isLRT) fun <- glmFit else fun <- glmQLFit
# Check design matrix
design <- as.matrix(glmfit$design)
nbeta <- ncol(design)
if(nbeta < 2) stop("Need at least two columns for design, usually the first is the intercept column")
coef.names <- colnames(design)
-
- if(glmfit$prior.count!=0){
- coefficients.mle <- glmfit$unshrunk.coefficients
- } else {
- coefficients.mle <- glmfit$coefficients
- }
# Evaluate logFC for coef to be tested
# Note that contrast takes precedence over coef: if contrast is given
@@ -50,68 +54,115 @@ glmTreat <- function(glmfit, coef=ncol(glmfit$design), contrast=NULL, lfc=0)
}
else
coef.name <- coef.names[coef]
- logFC <- coefficients.mle[, coef, drop=FALSE]/log(2)
+ unshrunk.logFC <- glmfit$coefficients[, coef, drop=FALSE]/log(2)
+ if(shrunk) {
+ logFC <- as.vector(unshrunk.logFC)
+ unshrunk.logFC <- glmfit$unshrunk.coefficients[, coef, drop=FALSE]/log(2)
+ }
} else {
contrast <- as.matrix(contrast)
reform <- contrastAsCoef(design, contrast=contrast, first=TRUE)
coef <- 1
- logFC <- drop((coefficients.mle %*% contrast)/log(2))
+ unshrunk.logFC <- drop((glmfit$coefficients %*% contrast)/log(2))
+ if(shrunk) {
+ logFC <- as.vector(unshrunk.logFC)
+ unshrunk.logFC <- drop((glmfit$unshrunk.coefficients %*% contrast)/log(2))
+ }
contrast <- drop(contrast)
i <- contrast!=0
coef.name <- paste(paste(contrast[i],coef.names[i],sep="*"),collapse=" ")
design <- reform$design
}
- logFC <- as.vector(logFC)
+ unshrunk.logFC <- as.vector(unshrunk.logFC)
+ up <- unshrunk.logFC >= 0
# Null design matrix
design0 <- design[, -coef, drop=FALSE]
-# Test statistics at beta_0 = zero
- fit0.zero <- fun(glmfit$counts, design=design0, offset=glmfit$offset, weights=glmfit$weights, dispersion=glmfit$dispersion, prior.count=0)
- X2.zero <- pmax(0, fit0.zero$deviance - glmfit$deviance)
+ if(isLRT) {
+# Adjusted offset
+ offset.adj <- matrix(-lfc*log(2), ngenes, 1)
+ offset.adj[up, ] <- lfc*log(2)
+
+# Test statistics at beta_0 = tau
+ offset.new <- glmfit$offset + offset.adj %*% t(design[, coef, drop=FALSE])
+ fit0.tau <- glmFit(glmfit$counts, design=design0, offset=offset.new, weights=glmfit$weights, dispersion=glmfit$dispersion, prior.count=0)
+ fit1.tau <- glmFit(glmfit$counts, design=design, offset=offset.new, weights=glmfit$weights, dispersion=glmfit$dispersion, prior.count=0)
+ X2.tau <- pmax(0, fit0.tau$deviance - fit1.tau$deviance)
+ z.tau <- sqrt(X2.tau)
+
+# Test statistics at beta_0 = -tau
+ offset.new <- glmfit$offset - offset.adj %*% t(design[, coef, drop=FALSE])
+ fit0.tau <- glmFit(glmfit$counts, design=design0, offset=offset.new, weights=glmfit$weights, dispersion=glmfit$dispersion, prior.count=0)
+ fit1.tau <- glmFit(glmfit$counts, design=design, offset=offset.new, weights=glmfit$weights, dispersion=glmfit$dispersion, prior.count=0)
+ X2.tau2 <- pmax(0, fit0.tau$deviance - fit1.tau$deviance)
+ z.tau2 <- sqrt(X2.tau2)
-# Test statistics at beta_0 = tau
- offset.adj <- matrix(-lfc*log(2), ngenes, 1)
- up <- logFC >= 0
- offset.adj[up, ] <- lfc*log(2)
- offset.new <- glmfit$offset + offset.adj %*% t(design[, coef, drop=FALSE])
- fit0.tau <- fun(glmfit$counts, design=design0, offset=offset.new, weights=glmfit$weights, dispersion=glmfit$dispersion, prior.count=0)
- fit1.tau <- fun(glmfit$counts, design=design, offset=offset.new, weights=glmfit$weights, dispersion=glmfit$dispersion, prior.count=0)
- X2.tau <- pmax(0, fit0.tau$deviance - fit1.tau$deviance)
+ within <- abs(unshrunk.logFC) <= lfc
+ sgn <- 2*within - 1
- within <- abs(logFC) <= lfc
- sgn <- 2*within - 1
- z.zero <- sqrt(X2.zero)
- z.tau <- sqrt(X2.tau)
-
- if(isLRT){
- p.value <- pnorm( z.tau*sgn ) + pnorm( -z.tau*sgn-2*z.zero )
+# Integral of Normal CDF from a to b
+ fun <- function(a,b) ifelse(a==b, pnorm(a), ( b*pnorm(b)+dnorm(b) - (a*pnorm(a)+dnorm(a)) )/(b-a) )
+ p.value <- 2*fun(-z.tau2, z.tau*sgn)
} else {
- t.zero <- z.zero/sqrt(glmfit$var.post)
- t.tau <- z.tau/sqrt(glmfit$var.post)
+# Gauss quadrature
+ if(!requireNamespace("statmod",quietly=TRUE)) stop("statmod required but is not available")
+ nnodes <- 5
+ gquad <- statmod::gauss.quad.prob(nnodes, dist="uniform", l=0, u=lfc*log(2))
+
+ offset.adj <- X2.pos <- X2.neg <- matrix(-gquad$nodes, ngenes, nnodes, byrow=TRUE)
+ offset.adj[up, ] <- -offset.adj[up, ]
+
+ for(k in 1:nnodes){
+# Test statistics at beta_0 = pos_nodes (tau)
+ offset.new <- glmfit$offset + offset.adj[,k] %*% t(design[, coef, drop=FALSE])
+ fit0.pos <- glmQLFit(glmfit$counts, design=design0, offset=offset.new, weights=glmfit$weights, dispersion=glmfit$dispersion, prior.count=0)
+ fit1.pos <- glmQLFit(glmfit$counts, design=design, offset=offset.new, weights=glmfit$weights, dispersion=glmfit$dispersion, prior.count=0)
+ X2.pos[,k] <- pmax(0, fit0.pos$deviance - fit1.pos$deviance)
+
+# Test statistics at beta_0 = neg_nodes (-tau)
+ offset.new <- glmfit$offset - offset.adj[,k] %*% t(design[, coef, drop=FALSE])
+ fit0.neg <- glmQLFit(glmfit$counts, design=design0, offset=offset.new, weights=glmfit$weights, dispersion=glmfit$dispersion, prior.count=0)
+ fit1.neg <- glmQLFit(glmfit$counts, design=design, offset=offset.new, weights=glmfit$weights, dispersion=glmfit$dispersion, prior.count=0)
+ X2.neg[,k] <- pmax(0, fit0.neg$deviance - fit1.neg$deviance)
+ }
+ z.pos <- sqrt(X2.pos)
+ z.neg <- sqrt(X2.neg)
+
+ within <- matrix(abs(unshrunk.logFC)*log(2), ngenes, nnodes) <= abs(offset.adj)
+ sgn <- 2*within - 1
+
+# Calculate expected p-values by Gauss quadrature
+ t.pos <- z.pos/sqrt(glmfit$var.post)
+ t.neg <- z.neg/sqrt(glmfit$var.post)
df.total <- glmfit$df.prior + glmfit$df.residual.zeros
max.df.residual <- ncol(glmfit$counts)-ncol(glmfit$design)
df.total <- pmin(df.total, nrow(glmfit)*max.df.residual)
- p.value <- pt( t.tau*sgn, df=df.total ) + pt( -t.tau*sgn-2*t.zero, df=df.total )
+ p.value <- ( pt(t.pos*sgn, df=df.total) + pt(t.neg, df=df.total, lower.tail=FALSE) ) %*% gquad$weights
- # Ensure is not more significant than z-test
+# Ensure it is not more significant than the z-test
i <- glmfit$var.post < 1
if(any(i)) {
- z.pvalue <- pnorm( z.tau[i]*sgn[i] ) + pnorm( -z.tau[i]*sgn[i]-2*z.zero[i] )
+ z.pvalue <- ( pnorm(z.pos[i,]*sgn[i,]) + pnorm(z.neg[i,], lower.tail=FALSE) ) %*% gquad$weights
p.value[i] <- pmax(p.value[i], z.pvalue)
- }
+ }
}
+# Table output
tab <- data.frame(
- logFC=logFC,
logCPM=glmfit$AveLogCPM,
PValue=p.value,
row.names=rownames(glmfit)
)
+ if(shrunk) {
+ tab <- cbind(logFC=logFC, unshrunk.logFC=unshrunk.logFC, tab)
+ } else {
+ tab <- cbind(logFC=unshrunk.logFC, tab)
+ }
+
glmfit$lfc <- lfc
glmfit$counts <- NULL
- glmfit$table <- tab
+ glmfit$table <- tab
glmfit$comparison <- coef.name
new("DGELRT",unclass(glmfit))
}
-
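A sketch of glmTreat() against the revised code path, assuming y and design as in the simulated example above (DGEList with dispersions estimated); with lfc=0 the call now dispatches to glmQLFTest() or glmLRT():

    # assumes y and design from the earlier sketch; statmod is needed for the QL path
    fit <- glmQLFit(y, design)
    tr  <- glmTreat(fit, coef=2, lfc=1)    # test |logFC| > 1 via Gauss quadrature
    topTags(tr)
    glmTreat(fit, coef=2, lfc=0)           # zero threshold: falls through to glmQLFTest()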
diff --git a/R/glmfit.R b/R/glmfit.R
index cfe1028..785e74c 100644
--- a/R/glmfit.R
+++ b/R/glmfit.R
@@ -23,7 +23,7 @@ glmFit.default <- function(y, design=NULL, dispersion=NULL, offset=NULL, lib.siz
# Fit negative binomial generalized linear model for each transcript
# to a series of digital expression libraries
# Davis McCarthy and Gordon Smyth
-# Created 17 August 2010. Last modified 11 Sep 2014.
+# Created 17 August 2010. Last modified 18 May 2015.
{
# Check y
y <- as.matrix(y)
@@ -43,6 +43,7 @@ glmFit.default <- function(y, design=NULL, dispersion=NULL, offset=NULL, lib.siz
# Check dispersion
if(is.null(dispersion)) stop("No dispersion values provided.")
+ dispersion.mat <- expandAsMatrix(dispersion, dim(y), byrow=FALSE)
# Check offset and lib.size
if(is.null(offset)) {
@@ -57,11 +58,11 @@ glmFit.default <- function(y, design=NULL, dispersion=NULL, offset=NULL, lib.siz
# If the design is equivalent to a oneway layout, use a shortcut algorithm
group <- designAsFactor(design)
if(nlevels(group)==ncol(design)) {
- fit <- mglmOneWay(y,design=design,dispersion=dispersion,offset=offset,weights=weights,coef.start=start)
- fit$deviance <- nbinomDeviance(y=y,mean=fit$fitted.values,dispersion=dispersion,weights=weights)
+ fit <- mglmOneWay(y,design=design,dispersion=dispersion.mat,offset=offset,weights=weights,coef.start=start)
+ fit$deviance <- nbinomDeviance(y=y,mean=fit$fitted.values,dispersion=dispersion.mat,weights=weights)
fit$method <- "oneway"
} else {
- fit <- mglmLevenberg(y,design=design,dispersion=dispersion,offset=offset,weights=weights,coef.start=start,maxit=250)
+ fit <- mglmLevenberg(y,design=design,dispersion=dispersion.mat,offset=offset,weights=weights,coef.start=start,maxit=250)
fit$method <- "levenberg"
}
@@ -71,7 +72,7 @@ glmFit.default <- function(y, design=NULL, dispersion=NULL, offset=NULL, lib.siz
fit$unshrunk.coefficients <- fit$coefficients
colnames(fit$unshrunk.coefficients) <- colnames(design)
rownames(fit$unshrunk.coefficients) <- rownames(y)
- fit$coefficients <- predFC(y,design,offset=offset,dispersion=dispersion,prior.count=prior.count,weights=weights,...)*log(2)
+ fit$coefficients <- predFC(y,design,offset=offset,dispersion=dispersion.mat,prior.count=prior.count,weights=weights,...)*log(2)
}
colnames(fit$coefficients) <- colnames(design)
rownames(fit$coefficients) <- rownames(y)
@@ -87,10 +88,10 @@ glmFit.default <- function(y, design=NULL, dispersion=NULL, offset=NULL, lib.siz
}
-glmLRT <- function(glmfit,coef=ncol(glmfit$design),contrast=NULL,test="chisq")
+glmLRT <- function(glmfit,coef=ncol(glmfit$design),contrast=NULL)
# Tagwise likelihood ratio tests for DGEGLM
# Gordon Smyth, Davis McCarthy and Yunshun Chen.
-# Created 1 July 2010. Last modified 22 Nov 2013.
+# Created 1 July 2010. Last modified 11 June 2015.
{
# Check glmfit
if(!is(glmfit,"DGEGLM")) {
@@ -101,10 +102,6 @@ glmLRT <- function(glmfit,coef=ncol(glmfit$design),contrast=NULL,test="chisq")
}
if(is.null(glmfit$AveLogCPM)) glmfit$AveLogCPM <- aveLogCPM(glmfit)
nlibs <- ncol(glmfit)
-
-# Check test
- test <- match.arg(test,c("F","f","chisq"))
- if(test=="f") test <- "F"
# Check design matrix
design <- as.matrix(glmfit$design)
@@ -157,20 +154,7 @@ glmLRT <- function(glmfit,coef=ncol(glmfit$design),contrast=NULL,test="chisq")
# Likelihood ratio statistic
LR <- fit.null$deviance - glmfit$deviance
df.test <- fit.null$df.residual - glmfit$df.residual
-
-# Chisquare or F-test
- LRT.pvalue <- switch(test,
- "F" = {
- phi <- quantile(glmfit$dispersion,p=0.5)
- mu <- quantile(glmfit$fitted.values,p=0.5)
- gamma.prop <- (phi*mu/(1 + phi*mu))^2
- prior.df <- glmfit$prior.df
- if(is.null(prior.df)) prior.df <- 20
- glmfit$df.total <- glmfit$df.residual + prior.df/gamma.prop
- pf(LR/df.test, df1=df.test, df2=glmfit$df.total, lower.tail = FALSE, log.p = FALSE)
- },
- "chisq" = pchisq(LR, df=df.test, lower.tail = FALSE, log.p = FALSE)
- )
+ LRT.pvalue <- pchisq(LR, df=df.test, lower.tail = FALSE, log.p = FALSE)
rn <- rownames(glmfit)
if(is.null(rn))
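With the test argument removed, glmLRT() always reports chi-square p-values; a short sketch, again assuming y and design from the simulated example above:

    fit <- glmFit(y, design)
    lrt <- glmLRT(fit, coef=2)    # likelihood ratio test, chi-square p-values only
    topTags(lrt)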
diff --git a/R/goana.R b/R/goana.R
index 10b14d0..68a1974 100644
--- a/R/goana.R
+++ b/R/goana.R
@@ -1,32 +1,35 @@
-goana.DGELRT <- function(de, geneid = rownames(de), FDR = 0.05, species = "Hs", trend = FALSE, ...)
+goana.DGELRT <- function(de, geneid = rownames(de), FDR = 0.05, trend = FALSE, ...)
# Gene ontology analysis of DE genes from linear model fit
# Gordon Smyth, Yifang Hu and Yunshun Chen
-# Created 25 August 2014. Last modified 6 April 2015.
+# Created 25 August 2014. Last modified 4 June 2015.
{
- # Check fit
- if(is.null(de$table$PValue)) stop("p values not found in fit object")
+# Avoid argument collision with default method
+ dots <- names(list(...))
+ if("universe" %in% dots) stop("goana.DGELRT defines its own universe",call.=FALSE)
+ if((!is.logical(trend) || trend) && "covariate" %in% dots) stop("goana.DGELRT defines its own covariate",call.=FALSE)
ngenes <- nrow(de)
- # Check geneid
- # Can be either a vector of IDs or a column name
+# Check geneid
+# Can be either a vector of gene IDs or an annotation column name
geneid <- as.character(geneid)
if(length(geneid) == ngenes) {
universe <- geneid
- } else
+ } else {
if(length(geneid) == 1L) {
universe <- de$genes[[geneid]]
- if(is.null(universe)) stop(paste("Column",geneid,"not found in de$genes"))
+ if(is.null(universe)) stop("Column ",geneid," not found in de$genes")
} else
- stop("geneid has incorrect length")
+ stop("geneid of incorrect length")
+ }
- # Check trend
- # Can be logical, or a numeric vector of covariate values, or the name of the column containing the covariate values
+# Check trend
+# Can be logical, or a numeric vector of covariate values, or the name of the column containing the covariate values
if(is.logical(trend)) {
if(trend) {
covariate <- de$table$logCPM
if(is.null(covariate)) stop("logCPM not found in fit object")
}
- } else
+ } else {
if(is.numeric(trend)) {
if(length(trend) != ngenes) stop("If trend is numeric, then length must equal nrow(de)")
covariate <- trend
@@ -35,32 +38,34 @@ goana.DGELRT <- function(de, geneid = rownames(de), FDR = 0.05, species = "Hs",
if(is.character(trend)) {
if(length(trend) != 1L) stop("If trend is character, then length must be 1")
covariate <- de$genes[[trend]]
- if(is.null(covariate)) stop(paste("Column",trend,"not found in de$genes"))
+ if(is.null(covariate)) stop("Column ",trend," not found in de$genes")
trend <- TRUE
} else
stop("trend is neither logical, numeric nor character")
}
+ }
- # Check FDR
+# Check FDR
if(!is.numeric(FDR) | length(FDR) != 1) stop("FDR must be numeric and of length 1.")
if(FDR < 0 | FDR > 1) stop("FDR should be between 0 and 1.")
- # Get up and down DE genes
+# Get up and down DE genes
fdr.coef <- p.adjust(de$table$PValue, method = "BH")
EG.DE.UP <- universe[fdr.coef < FDR & de$table$logFC > 0]
EG.DE.DN <- universe[fdr.coef < FDR & de$table$logFC < 0]
- de.gene <- list(Up=EG.DE.UP, Down=EG.DE.DN)
+ DEGenes <- list(Up=EG.DE.UP, Down=EG.DE.DN)
- # Fit monotonic cubic spline for DE genes vs. gene.weights
- if(trend) {
- PW <- isDE <- rep(0,ngenes)
- isDE[fdr.coef < FDR] <- 1
- o <- order(covariate)
- PW[o] <- tricubeMovingAverage(isDE[o],span=0.5,full.length=TRUE)
+# If no DE genes, return data.frame with 0 rows
+ if(length(EG.DE.UP)==0 && length(EG.DE.DN)==0) {
+ message("No DE genes")
+ return(data.frame())
}
- if(!trend) PW <- NULL
- goana(de = de.gene, universe = universe, species = species, prior.prob = PW, ...)
+ if(trend)
+ goana(de=DEGenes, universe = universe, covariate=covariate, ...)
+ else
+ goana(de=DEGenes, universe = universe, ...)
}
+
goana.DGEExact <- goana.DGELRT
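A sketch of goana() on test results; this assumes a hypothetical DGELRT object lrt whose rownames are Entrez Gene IDs, which is what the limma default method expects:

    # lrt: DGELRT from glmLRT(), rownames are Entrez Gene IDs (assumed)
    go <- goana(lrt, FDR=0.05, trend=TRUE, species="Hs")
    topGO(go, sort="Up")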
diff --git a/R/kegga.R b/R/kegga.R
new file mode 100644
index 0000000..5549e1a
--- /dev/null
+++ b/R/kegga.R
@@ -0,0 +1,71 @@
+kegga.DGELRT <- function(de, geneid = rownames(de), FDR = 0.05, trend = FALSE, ...)
+# KEGG analysis of DE genes from linear model fit
+# Gordon Smyth
+# Created 4 June 2015. Last modified 4 June 2015.
+{
+# Avoid argument collision with default method
+ dots <- names(list(...))
+ if("universe" %in% dots) stop("kegga.DGELRT defines its own universe",call.=FALSE)
+ if((!is.logical(trend) || trend) && "covariate" %in% dots) stop("kegga.DGELRT defines its own covariate",call.=FALSE)
+ ngenes <- nrow(de)
+
+# Check geneid
+# Can be either a vector of gene IDs or an annotation column name
+ geneid <- as.character(geneid)
+ if(length(geneid) == ngenes) {
+ universe <- geneid
+ } else {
+ if(length(geneid) == 1L) {
+ universe <- de$genes[[geneid]]
+ if(is.null(universe)) stop("Column ",geneid," not found in de$genes")
+ } else
+ stop("geneid of incorrect length")
+ }
+
+# Check trend
+# Can be logical, or a numeric vector of covariate values, or the name of the column containing the covariate values
+ if(is.logical(trend)) {
+ if(trend) {
+ covariate <- de$table$logCPM
+ if(is.null(covariate)) stop("logCPM not found in fit object")
+ }
+ } else {
+ if(is.numeric(trend)) {
+ if(length(trend) != ngenes) stop("If trend is numeric, then length must equal nrow(de)")
+ covariate <- trend
+ trend <- TRUE
+ } else {
+ if(is.character(trend)) {
+ if(length(trend) != 1L) stop("If trend is character, then length must be 1")
+ covariate <- de$genes[[trend]]
+ if(is.null(covariate)) stop("Column ",trend," not found in de$genes")
+ trend <- TRUE
+ } else
+ stop("trend is neither logical, numeric nor character")
+ }
+ }
+
+# Check FDR
+ if(!is.numeric(FDR) | length(FDR) != 1) stop("FDR must be numeric and of length 1.")
+ if(FDR < 0 | FDR > 1) stop("FDR should be between 0 and 1.")
+
+# Get up and down DE genes
+ fdr.coef <- p.adjust(de$table$PValue, method = "BH")
+ EG.DE.UP <- universe[fdr.coef < FDR & de$table$logFC > 0]
+ EG.DE.DN <- universe[fdr.coef < FDR & de$table$logFC < 0]
+ DEGenes <- list(Up=EG.DE.UP, Down=EG.DE.DN)
+
+# If no DE genes, return data.frame with 0 rows
+ if(length(EG.DE.UP)==0 && length(EG.DE.DN)==0) {
+ message("No DE genes")
+ return(data.frame())
+ }
+
+ if(trend)
+ kegga(de=DEGenes, universe = universe, covariate=covariate, ...)
+ else
+ kegga(de=DEGenes, universe = universe, ...)
+}
+
+
+kegga.DGEExact <- kegga.DGELRT
diff --git a/R/mglmLevenberg.R b/R/mglmLevenberg.R
index b39bfef..7044fee 100644
--- a/R/mglmLevenberg.R
+++ b/R/mglmLevenberg.R
@@ -16,7 +16,7 @@ mglmLevenberg <- function(y, design, dispersion=0, offset=0, weights=NULL, coef.
if(!( all(is.finite(y)) || all(is.finite(design)) )) stop("All values must be finite and non-missing")
design <- as.matrix(design)
- if(length(dispersion)<ngenes) dispersion <- rep(dispersion,length.out=ngenes)
+ dispersion <- expandAsMatrix(dispersion, dim(y), byrow=FALSE)
if(is.null(coef.start)) {
start.method <- match.arg(start.method, c("null","y"))
@@ -59,7 +59,7 @@ mglmLevenberg <- function(y, design, dispersion=0, offset=0, weights=NULL, coef.
if (!is.double(weights)) storage.mode(weights) <- "double"
if (!is.double(beta)) storage.mode(beta) <- "double"
if (!is.double(mu)) storage.mode(mu) <- "double"
- output <- .Call(.cR_levenberg, nlibs, ngenes, design, t(y), dispersion, offset, weights, beta, mu, tol, maxit)
+ output <- .Call(.cR_levenberg, nlibs, ngenes, design, t(y), t(dispersion), offset, weights, beta, mu, tol, maxit)
# Check for error condition
if (is.character(output)) { stop(output) }
diff --git a/R/mglmOneGroup.R b/R/mglmOneGroup.R
index 1d05252..036d651 100644
--- a/R/mglmOneGroup.R
+++ b/R/mglmOneGroup.R
@@ -11,7 +11,7 @@ mglmOneGroup <- function(y,dispersion=0,offset=0,weights=NULL,maxit=50,tol=1e-10
nlibs <- ncol(y)
# Check dispersion
- dispersion <- as.vector(dispersion)
+ dispersion <- expandAsMatrix(dispersion, dim(y), byrow=FALSE)
if(typeof(dispersion) != "double") stop("dispersion not floating point number")
if(any(dispersion<0)) stop("dispersion must be non-negative")
@@ -37,12 +37,11 @@ mglmOneGroup <- function(y,dispersion=0,offset=0,weights=NULL,maxit=50,tol=1e-10
if(typeof(weights) != "double") stop("weights is non-numeric")
# Expansions to full dimensions
- dispersion <- rep(dispersion,length=ntags)
offset <- expandAsMatrix(offset,dim(y))
weights <- expandAsMatrix(weights,dim(y))
# Fisher scoring iteration.
- output <- .Call(.cR_one_group, nlibs, ntags, t(y), dispersion, offset, weights, maxit, tol, coef.start)
+ output <- .Call(.cR_one_group, nlibs, ntags, t(y), t(dispersion), offset, weights, maxit, tol, coef.start)
# Check error condition
if(is.character(output)) stop(output)
diff --git a/R/mglmOneWay.R b/R/mglmOneWay.R
index c0cec77..8641a58 100644
--- a/R/mglmOneWay.R
+++ b/R/mglmOneWay.R
@@ -31,6 +31,7 @@ mglmOneWay <- function(y,design=NULL,dispersion=0,offset=0,weights=NULL,maxit=50
stopifnot(ncol(design)==ngroups)
mu <- matrix(0,ntags,ngroups)
offset <- expandAsMatrix(offset,dim(y))
+ dispersion <- expandAsMatrix(dispersion, dim(y), byrow=FALSE)
if(!is.null(weights)) weights <- expandAsMatrix(weights,dim(y))
firstjofgroup <- rep(0,ngroups)
@@ -39,7 +40,7 @@ mglmOneWay <- function(y,design=NULL,dispersion=0,offset=0,weights=NULL,maxit=50
j <- which(group==(levels(group)[g]))
firstjofgroup[g] <- j[1]
if (!is.null(coef.start)) { new.start <- coef.start %*% design[firstjofgroup[g],] }
- mu[,g] <- mglmOneGroup(y[,j,drop=FALSE],dispersion=dispersion,offset=offset[,j,drop=FALSE],weights=weights[,j,drop=FALSE],maxit=maxit,tol=tol,
+ mu[,g] <- mglmOneGroup(y[,j,drop=FALSE],dispersion=dispersion[,j,drop=FALSE],offset=offset[,j,drop=FALSE],weights=weights[,j,drop=FALSE],maxit=maxit,tol=tol,
coef.start=new.start)
}
diff --git a/R/plotMD.R b/R/plotMD.R
new file mode 100644
index 0000000..dcbc5ac
--- /dev/null
+++ b/R/plotMD.R
@@ -0,0 +1,48 @@
+plotMD.DGEList <- function(object, column=1, xlab="Average log CPM (this sample and others)", ylab="log-ratio (this sample vs others)", main=colnames(object)[column], status=object$genes$Status, zero.weights=FALSE, prior.count=3, ...)
+# Mean-difference plot with color coding for controls
+# Gordon Smyth
+# Created 24 June 2015. Last modified 24 June 2015.
+{
+ nlib <- ncol(object)
+ if(nlib < 2L) stop("Need at least two columns")
+
+# Convert column to integer if not already
+ j <- 1:nlib
+ names(j) <- colnames(object)
+ column <- j[column[1]]
+
+ logCPM <- cpm(object, log=TRUE, prior.count=prior.count)
+ AveOfOthers <- rowMeans(logCPM[,-column,drop=FALSE],na.rm=TRUE)
+ Diff <- logCPM[,column]-AveOfOthers
+ Mean <- (logCPM[,column]+AveOfOthers)/2
+
+ if(!zero.weights && !is.null(object$weights)) {
+ w <- as.matrix(object$weights)[,column]
+ Diff[ is.na(w) | (w <= 0) ] <- NA
+ }
+
+ plotWithHighlights(x=Mean,y=Diff,xlab=xlab,ylab=ylab,main=main,status=status,...)
+}
+
+plotMD.DGEGLM <- function(object, column=ncol(object), coef=NULL, xlab="Average log CPM", ylab="log-fold-change", main=colnames(object)[column], status=object$genes$Status, zero.weights=FALSE, ...)
+# Mean-difference plot with color coding for controls
+# Gordon Smyth
+# Created 24 June 2015. Last modified 24 June 2015.
+{
+ if(!is.null(coef)) column <- coef
+ if(is.null(object$AveLogCPM)) stop("AveLogCPM component is absent.")
+ logFC <- as.matrix(object$coefficients)[,column]
+ if(!zero.weights && !is.null(object$weights)) {
+ w <- as.matrix(object$weights)[,column]
+ logFC[ is.na(w) | (w <= 0) ] <- NA
+ }
+ plotWithHighlights(x=object$AveLogCPM,y=logFC,xlab=xlab,ylab=ylab,main=main,status=status,...)
+}
+
+plotMD.DGEExact <- plotMD.DGELRT <- function(object, xlab="Average log CPM", ylab="log-fold-change", main=object$comparison, status=object$genes$Status, ...)
+# Mean-difference plot with color coding for controls
+# Gordon Smyth
+# Last modified 24 June 2015.
+{
+ plotWithHighlights(x=object$table$logCPM,y=object$table$logFC,xlab=xlab,ylab=ylab,main=main,status=status,...)
+}
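A sketch of the new plotMD() methods, assuming hypothetical objects fit (a DGEGLM from glmFit() or glmQLFit()) and lrt (a DGELRT from glmLRT()):

    plotMD(fit, coef=2)                    # coefficient log-fold-changes vs average logCPM
    plotMD(lrt)                            # logFC vs logCPM from the test results
    abline(h=c(-1,1), col="blue", lty=2)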
diff --git a/R/readDGE.R b/R/readDGE.R
index 34e81ff..dccf58c 100644
--- a/R/readDGE.R
+++ b/R/readDGE.R
@@ -1,40 +1,54 @@
readDGE <- function(files,path=NULL,columns=c(1,2),group=NULL,labels=NULL,...)
# Read and collate a set of count data files, each file containing counts for one library
-# Created 2006. Last modified 15 August 2014.
+# Created 2006. Last modified 15 June 2015.
{
+# Create data.frame to hold sample information
x <- list()
if(is.data.frame(files)) {
x$samples <- files
- if(is.null(labels)) labels <- row.names(files)
- files <- files$files
+ if(!is.null(labels)) row.names(x$samples) <- labels
+ if(is.null(x$samples$files)) stop("file names not found")
+ x$samples$files <- as.character(x$samples$files)
} else {
- x$samples <- data.frame(files=as.character(files),group=1,stringsAsFactors=FALSE)
+ files <- as.character(files)
+ if(is.null(labels)) labels <- removeExt(files)
+ x$samples <- data.frame(files=files,row.names=labels,stringsAsFactors=FALSE)
}
+ nfiles <- nrow(x$samples)
+
+# Set group factor
if(!is.null(group)) x$samples$group <- group
- if(!is.null(x$samples$group)) x$samples$group <- as.factor(x$samples$group)
+ if(is.null(x$samples$group)) x$samples$group <- rep(1,nfiles)
+ x$samples$group <- as.factor(x$samples$group)
+
+# Read files
d <- taglist <- list()
- for (fn in files) {
+ for (fn in x$samples$files) {
if(!is.null(path)) fn <- file.path(path,fn)
d[[fn]] <- read.delim(fn,...,stringsAsFactors=FALSE)
taglist[[fn]] <- as.character(d[[fn]][,columns[1]])
- if(any(duplicated(taglist[[fn]]))) {
+ if(anyDuplicated(taglist[[fn]])) {
stop(paste("Repeated tag sequences in",fn))
}
}
+
+# Collate counts for unique tags
tags <- unique(unlist(taglist))
ntags <- length(tags)
- nfiles <- length(files)
x$counts <- matrix(0,ntags,nfiles)
- rownames(x$counts) <- tags
- colnames(x$counts) <- labels
- if(is.null(colnames(x$counts))) colnames(x$counts) <- removeExt(files)
+ dimnames(x$counts) <- list(Tags=tags,Samples=row.names(x$samples))
for (i in 1:nfiles) {
aa <- match(taglist[[i]],tags)
x$counts[aa,i] <- d[[i]][,columns[2]]
}
+
+# Enter library sizes and norm factors
x$samples$lib.size <- colSums(x$counts)
x$samples$norm.factors <- 1
- row.names(x$samples) <- colnames(x$counts)
- x$genes <- NULL
+
+# Alert user if htseq style meta genes found
+ MetaTags <- grep("^__",tags,value=TRUE)
+ if(length(MetaTags)) message("Meta tags detected: ",paste(MetaTags,collapse=", "))
+
new("DGEList",x)
}
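A sketch of the reworked readDGE(), where a targets data.frame must carry a files column and htseq-count style meta tags are now reported via message(); the file names and column layout below are hypothetical:

    library(edgeR)
    targets <- data.frame(files=c("sample1.txt","sample2.txt"),
                          group=c("Control","Treatment"))
    # each file assumed to have tag IDs in column 1 and counts in column 2
    y <- readDGE(targets, columns=c(1,2), header=FALSE)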
diff --git a/R/roast.DGEList.R b/R/roast.DGEList.R
deleted file mode 100644
index c7e2828..0000000
--- a/R/roast.DGEList.R
+++ /dev/null
@@ -1,82 +0,0 @@
-roast.DGEList <- function(y, index=NULL, design=NULL, contrast=ncol(design), ...)
-# Rotation gene set testing for RNA-Seq data
-# Yunshun Chen, Gordon Smyth
-# Created 19 Dec 2012. Last revised on 28 Feb 2014
-{
-# Check dispersion estimates in y
- dispersion <- getDispersion(y)
- if(is.null(dispersion)) stop("Dispersion estimate not found. Please estimate the dispersion(s) before you proceed.")
-
-# Make default design matrix from group factor
- if(is.null(design)) {
- if(nlevels(y$samples$group)<2) stop("design not supplied and samples all belong to the same group")
- design <- model.matrix(~y$samples$group)
- rownames(design) <- colnames(y)
- }
- nbeta <- ncol(design)
- if(nbeta < 2) stop("design matrix must have at least two columns")
-
-# Check contrast
- if(length(contrast) == 1) {
- u <- rep.int(0, nbeta)
- u[contrast] <- 1
- contrast <- u
- }
- if(length(contrast) != nbeta) stop("length of contrast must match column dimension of design")
- if(all(contrast==0)) stop("contrast all zero")
-
-# Construct null hypothesis design matrix
- QR <- qr(contrast)
- design0 <- t(qr.qty(QR, t(design))[-1, , drop=FALSE])
-
-# Null hypothesis fit
- fit.null <- glmFit(y, design0, prior.count=0)
-
-# Quantile residuals from null fit
- y <- zscoreNBinom(y$counts, mu=fit.null$fitted.values, size=1/dispersion)
-
- NextMethod("roast")
-}
-
-
-
-mroast.DGEList <- function(y, index=NULL, design=NULL, contrast=ncol(design), ...)
-# Rotation gene set testing for RNA-Seq data with multiple sets
-# Yunshun Chen, Gordon Smyth
-# Created 8 Jan 2013. Last revised 28 Feb 2014.
-{
-# Check dispersion estimates in y
- dispersion <- getDispersion(y)
- if(is.null(dispersion)) stop("Dispersion estimate not found. Please estimate the dispersion(s) before you proceed.")
-
-# Make default design matrix from group factor
- if(is.null(design)) {
- if(nlevels(y$samples$group)<2) stop("design not supplied and samples all belong to the same group")
- design <- model.matrix(~y$samples$group)
- rownames(design) <- colnames(y)
- }
- nbeta <- ncol(design)
- if(nbeta < 2) stop("design matrix must have at least two columns")
-
-# Check contrast
- if(length(contrast) == 1) {
- u <- rep.int(0, nbeta)
- u[contrast] <- 1
- contrast <- u
- }
- if(length(contrast) != nbeta) stop("length of contrast must match column dimension of design")
- if(all(contrast==0)) stop("contrast all zero")
-
-# Construct null hypothesis design matrix
- QR <- qr(contrast)
- design0 <- t(qr.qty(QR, t(design))[-1, , drop=FALSE])
-
-# Null hypothesis fit
- fit.null <- glmFit(y, design0, prior.count=0)
-
-# Quantile residuals from null fit
- y <- zscoreNBinom(y$counts, mu=fit.null$fitted.values, size=1/dispersion)
-
- NextMethod("mroast")
-}
-
diff --git a/R/romer.DGEList.R b/R/romer.DGEList.R
deleted file mode 100644
index d98f4b8..0000000
--- a/R/romer.DGEList.R
+++ /dev/null
@@ -1,39 +0,0 @@
-romer.DGEList <- function(y, index, design=NULL, contrast=ncol(design), ...)
-# rotation mean-rank version of GSEA (gene set enrichment analysis) for RNA-Seq data
-# Yunshun Chen, Gordon Smyth
-# Created 20 Oct 2014.
-{
-# Check dispersion estimates in y
- dispersion <- getDispersion(y)
- if(is.null(dispersion)) stop("Dispersion estimate not found. Please estimate the dispersion(s) before you proceed.")
-
-# Make default design matrix from group factor
- if(is.null(design)) {
- if(nlevels(y$samples$group)<2) stop("design not supplied and samples all belong to the same group")
- design <- model.matrix(~y$samples$group)
- rownames(design) <- colnames(y)
- }
- nbeta <- ncol(design)
- if(nbeta < 2) stop("design matrix must have at least two columns")
-
-# Check contrast
- if(length(contrast) == 1) {
- u <- rep.int(0, nbeta)
- u[contrast] <- 1
- contrast <- u
- }
- if(length(contrast) != nbeta) stop("length of contrast must match column dimension of design")
- if(all(contrast==0)) stop("contrast all zero")
-
-# Construct null hypothesis design matrix
- QR <- qr(contrast)
- design0 <- t(qr.qty(QR, t(design))[-1, , drop=FALSE])
-
-# Null hypothesis fit
- fit.null <- glmFit(y, design0, prior.count=0)
-
-# Quantile residuals from null fit
- y <- zscoreNBinom(y$counts, mu=fit.null$fitted.values, size=1/dispersion)
-
- NextMethod("romer")
-}
diff --git a/R/rpkm.R b/R/rpkm.R
index 2eb3081..5ae7929 100644
--- a/R/rpkm.R
+++ b/R/rpkm.R
@@ -27,7 +27,7 @@ rpkm.DGEList <- function(x, gene.length=NULL, normalized.lib.sizes=TRUE, log=FAL
lib.size <- x$samples$lib.size
if(normalized.lib.sizes) lib.size <- lib.size*x$samples$norm.factors
- rpkm.default(x=x$counts,gene.length=gene.length,lib.size=lib.size,log=log,prior.count=prior.count,...)
+ rpkm.default(x=x$counts,gene.length=gene.length,lib.size=lib.size,log=log,prior.count=prior.count, ...)
}
rpkm.default <- function(x, gene.length, lib.size=NULL, log=FALSE, prior.count=0.25, ...)
@@ -35,7 +35,7 @@ rpkm.default <- function(x, gene.length, lib.size=NULL, log=FALSE, prior.count=0
# Gordon Smyth
# Created 1 November 2012. Last modified 18 March 2014.
{
- y <- cpm.default(x=x,lib.size=lib.size,log=log,prior.count=prior.count)
+ y <- cpm.default(x=x,lib.size=lib.size,log=log,prior.count=prior.count, ...)
gene.length.kb <- gene.length/1000
if(log)
y-log2(gene.length.kb)
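The change above simply forwards the extra arguments to cpm.default; the RPKM calculation itself is unchanged: counts per million divided by gene length in kilobases, or equivalently log2-CPM minus log2 of the length in kb. A small sketch with made-up counts, library sizes and gene lengths:

# Illustrative inputs only
counts <- matrix(rpois(20, lambda=50), nrow=5)
lib.size <- colSums(counts)
gene.length <- c(1200, 800, 3000, 650, 2100)      # lengths in bases
cpm <- t(t(counts)/lib.size) * 1e6
rpkm <- cpm/(gene.length/1000)                    # divide by length in kb
log2.rpkm <- log2(cpm) - log2(gene.length/1000)   # same relationship on the log2 scale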
diff --git a/R/subsetting.R b/R/subsetting.R
index b4e148c..1f0705f 100644
--- a/R/subsetting.R
+++ b/R/subsetting.R
@@ -5,6 +5,8 @@ function(object, i, j, keep.lib.sizes=TRUE)
# Subsetting for DGEList objects
# 24 September 2009. Last modified 8 Feb 2015.
{
+ if(nargs() < 3) stop("Two subscripts required",call.=FALSE)
+
# Recognized components
IJ <- c("counts","pseudo.counts","offset","weights")
IX <- c("genes")
@@ -23,12 +25,13 @@ function(object, i, j)
# Subsetting for DGEGLM objects
# 11 May 2011. Last modified 09 May 2014.
{
+ if(nargs() != 3) stop("Two subscripts required",call.=FALSE)
if(!missing(j)) stop("Subsetting columns not allowed for DGEGLM object.",call.=FALSE)
# Recognized components
IJ <- character(0)
IX <- c("counts","offset","weights","genes","coefficients","fitted.values","unshrunk.coefficients")
- I <- c("AveLogCPM","trended.dispersion","tagwise.dispersion","prior.n","prior.df","dispersion","df.residual","deviance","iter","failed")
+ I <- c("AveLogCPM","trended.dispersion","tagwise.dispersion","prior.n","prior.df","dispersion","df.residual","df.residual.zeros","deviance","iter","failed")
JX <- c("samples","design")
subsetListOfArrays(object,i,j,IJ=IJ,IX=IX,I=I,JX=JX)
@@ -41,6 +44,7 @@ function(object, i, j)
# Davis McCarthy, Gordon Smyth
# 6 October 2010. Last modified 11 Dec 2013.
{
+ if(nargs() != 3) stop("Two subscripts required",call.=FALSE)
if(!missing(j)) stop("Subsetting columns not allowed for DGEExact objects.",call.=FALSE)
if(!missing(i)) {
object$table <- object$table[i,,drop=FALSE]
@@ -55,12 +59,13 @@ function(object, i, j)
# Subsetting for DGELRT objects
# 6 April 2011. Last modified 09 May 2014.
{
+ if(nargs() != 3) stop("Two subscripts required",call.=FALSE)
if(!missing(j)) stop("Subsetting columns not allowed for DGELRT object.",call.=FALSE)
# Recognized components
IJ <- character(0)
IX <- c("counts","offset","weights","genes","coefficients","fitted.values","table","unshrunk.coefficients")
- I <- c("AveLogCPM","trended.dispersion","tagwise.dispersion","prior.n","prior.df","dispersion","df.residual","deviance","iter","failed","df.test")
+ I <- c("AveLogCPM","trended.dispersion","tagwise.dispersion","prior.n","prior.df","dispersion","df.residual","df.residual.zeros","deviance","iter","failed","df.test")
JX <- character(0)
subsetListOfArrays(object,i,j,IJ=IJ,IX=IX,I=I,JX=JX)
@@ -72,6 +77,7 @@ function(object, i, j)
# Subsetting for TopTags objects
# 7 October 2009. Last modified 11 Dec 2013.
{
+ if(nargs() != 3) stop("Two subscripts required",call.=FALSE)
if(!missing(i) || !missing(j)) object$table <- object$table[i,j,drop=FALSE]
object
})
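The nargs() checks added above mean that subsetting now insists on the usual two-subscript matrix syntax. A short sketch, assuming dge is any DGEList object:

dge[1:100, ]       # first 100 genes, all samples
dge[, c(1, 3)]     # all genes, samples 1 and 3
try(dge[1:100])    # a single subscript now stops with "Two subscripts required"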
diff --git a/build/vignette.rds b/build/vignette.rds
index 3114448..2aa495e 100644
Binary files a/build/vignette.rds and b/build/vignette.rds differ
diff --git a/inst/doc/edgeR.pdf b/inst/doc/edgeR.pdf
index 94ca095..86fd262 100644
Binary files a/inst/doc/edgeR.pdf and b/inst/doc/edgeR.pdf differ
diff --git a/man/camera.DGEList.Rd b/man/camera.DGEList.Rd
index 08d7d5f..6979735 100644
--- a/man/camera.DGEList.Rd
+++ b/man/camera.DGEList.Rd
@@ -12,7 +12,7 @@ Test whether a set of genes is highly ranked relative to other genes in terms of
\item{y}{a \code{DGEList} object containing dispersion estimates.}
\item{index}{an index vector or a list of index vectors. Can be any vector such that \code{y[indices,]} selects the rows corresponding to the test set.}
\item{design}{the design matrix.}
- \item{contrast}{the contrast of the linear model coefficients for which the test is required. Can be an integer specifying a column of \code{design}, or else a numeric vector of same length as the number of columns of \code{design}.}
+ \item{contrast}{the contrast of the linear model coefficients for which the test is required. Can be an integer specifying a column of \code{design}, or the name of a column of \code{design}, or else a numeric vector of same length as the number of columns of \code{design}.}
\item{\dots}{other arguments are passed to \code{\link{camera.default}}.}
}
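A brief sketch of the equivalent ways of specifying the contrast that the updated help text describes, assuming y is a DGEList with dispersion estimates, design its design matrix and iset an index vector (all names illustrative):

design <- model.matrix(~group, data=y$samples)
camera(y, index=iset, design=design, contrast=2)          # column number
camera(y, index=iset, design=design, contrast="group2")   # column name (newly documented)
camera(y, index=iset, design=design, contrast=c(0, 1))    # numeric contrast vector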
diff --git a/man/cpm.Rd b/man/cpm.Rd
index 45cc321..2e5423f 100644
--- a/man/cpm.Rd
+++ b/man/cpm.Rd
@@ -23,7 +23,7 @@
\item{log}{logical, if \code{TRUE} then \code{log2} values are returned.}
\item{prior.count}{average count to be added to each observation to avoid taking log of zero. Used only if \code{log=TRUE}.}
\item{gene.length}{vector of length \code{nrow(x)} giving gene length in bases, or the name of the column \code{x$genes} containing the gene lengths.}
-\item{\ldots}{other arguments are not currently used}
+\item{\dots}{other arguments that are not currently used.}
}
\value{numeric matrix of CPM or RPKM values.}
diff --git a/man/diffSpliceDGE.Rd b/man/diffSpliceDGE.Rd
index 0bd4376..e87548a 100644
--- a/man/diffSpliceDGE.Rd
+++ b/man/diffSpliceDGE.Rd
@@ -4,12 +4,13 @@
\title{Test for Differential Exon Usage}
\description{Given a negative binomial generalized log-linear model fit at the exon level, test for differential exon usage between experimental conditions.}
\usage{
-diffSpliceDGE(fit.exon, coef=ncol(fit.exon$design), geneid, exonid=NULL, verbose=TRUE)
+diffSpliceDGE(fit.exon, coef=ncol(fit.exon$design), contrast=NULL, geneid, exonid=NULL, verbose=TRUE)
}
\arguments{
\item{fit.exon}{a \code{DGEGLM} fitted model object produced by \code{glmFit}. Rows should correspond to exons.}
\item{coef}{integer indicating which coefficient of the generalized linear model is to be tested for differential exon usage. Defaults to the last coefficient.}
+ \item{contrast}{numeric vector specifying the contrast of the linear model coefficients to be tested for differential exon usage. Length must be equal to the number of columns of \code{design}. If specified, then takes precedence over \code{coef}.}
\item{geneid}{gene identifiers. Either a vector of length \code{nrow(fit.exon)} or the name of the column of \code{fit.exon$genes} containing the gene identifiers. Rows with the same ID are assumed to belong to the same gene.}
\item{exonid}{exon identifiers. Either a vector of length \code{nrow(fit.exon)} or the name of the column of \code{fit.exon$genes} containing the exon identifiers.}
\item{verbose}{logical, if \code{TRUE} some diagnostic information about the number of genes and exons is output.}
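A short usage sketch of the new contrast argument alongside coef, assuming an exon-level DGEList y.exon, a three-column design matrix and annotation columns with the illustrative names GeneID and ExonID:

fit.exon <- glmFit(y.exon, design)
sp <- diffSpliceDGE(fit.exon, coef=2, geneid="GeneID", exonid="ExonID")
sp <- diffSpliceDGE(fit.exon, contrast=c(0,1,-1), geneid="GeneID", exonid="ExonID")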
diff --git a/man/dimnames.Rd b/man/dimnames.Rd
index d89a87c..65f52b0 100644
--- a/man/dimnames.Rd
+++ b/man/dimnames.Rd
@@ -5,7 +5,9 @@
\alias{dimnames.DGELRT}
\alias{dimnames.TopTags}
\alias{dimnames<-.DGEList}
+\alias{dimnames<-.DGEExact}
\alias{dimnames<-.DGEGLM}
+\alias{dimnames<-.DGELRT}
\title{Retrieve the Dimension Names of a DGE Object}
\description{
@@ -24,7 +26,7 @@ The dimension names of a DGE data object are the same as those of the most impor
Setting dimension names is currently only permitted for \code{DGEList} or \code{DGEGLM} objects.
-A consequence is that \code{rownames} and \code{colnames} will work as expected.
+A consequence of these methods is that \code{rownames}, \code{colnames}, \code{rownames<-} and \code{colnames<-} will also work as expected on any of the above object classes.
}
\value{
Either \code{NULL} or a list of length 2.
diff --git a/man/estimateCommonDisp.Rd b/man/estimateCommonDisp.Rd
index c291994..0f12211 100644
--- a/man/estimateCommonDisp.Rd
+++ b/man/estimateCommonDisp.Rd
@@ -4,7 +4,7 @@
\title{Estimate Common Negative Binomial Dispersion by Conditional Maximum Likelihood}
\description{
-Maximizes the negative binomial conditional common likelihood to give the estimate of the common dispersion across all genes.
+Maximizes the negative binomial conditional common likelihood to estimate a common dispersion value across all genes.
}
\usage{
@@ -12,41 +12,47 @@ estimateCommonDisp(object, tol=1e-06, rowsum.filter=5, verbose=FALSE)
}
\arguments{
-\item{object}{\code{DGEList} object}
+\item{object}{a \code{DGEList} object.}
-\item{tol}{the desired accuracy, passed to \code{\link{optimize}}}
+\item{tol}{the desired accuracy, passed to \code{\link{optimize}}.}
-\item{rowsum.filter}{numeric scalar giving a value for the filtering out of low abundance genes in the estimation of the common dispersion.
-Only genes with total sum of counts above this value are used in the estimation of the common dispersion.}
+\item{rowsum.filter}{genes with total count (across all samples) below this value will be filtered out before estimating the dispersion.}
-\item{verbose}{logical, if \code{TRUE} estimated dispersion and BCV will be printed to standard output.}
+\item{verbose}{logical, if \code{TRUE} then the estimated dispersion and BCV will be printed to standard output.}
}
\value{Returns \code{object} with the following added components:
\item{common.dispersion}{estimate of the common dispersion.}
- \item{pseudo.counts}{numeric matrix of quantile-quantile normalized counts. These are counts adjusted so that the library sizes are equal, while preserving differences between groups and variability within each group.}
- \item{pseudo.lib.size}{the common library size to which the counts have been adjusted}
+ \item{pseudo.counts}{numeric matrix of pseudo-counts.}
+ \item{pseudo.lib.size}{the common library size to which the pseudo-counts have been adjusted.}
}
\details{
-Implements the method of Robinson and Smyth (2008) for estimating a common dispersion parameter by conditional maximum likelihood.
-The method of conditional maximum likelihood assumes that library sizes are equal, which is not true in general, so pseudocounts (counts adjusted so that the library sizes are equal) need to be calculated. The function \code{equalizeLibSizes} is called to adjust the counts using a quantile-to-quantile method, but this requires a fixed value for the common dispersion parameter. To obtain a good estimate for the common dispersion, pseudocounts are calculated under the Poisson model (disper [...]
+Implements the conditional maximum likelihood (CML) method proposed by Robinson and Smyth (2008) for estimating a common dispersion parameter.
+This method proves to be accurate and nearly unbiased even for small counts and small numbers of replicates.
+
+The CML method involves computing a matrix of quantile-quantile normalized counts, called pseudo-counts.
+The pseudo-counts are adjusted in such a way that the library sizes are equal for all samples, while preserving differences between groups and variability within each group.
+The pseudo-counts are included in the output of the function, but are intended mainly for internal edgeR use.
}
\references{
-Robinson MD and Smyth GK (2008). Small-sample estimation of negative
-binomial dispersion, with applications to SAGE data. \emph{Biostatistics},
-9, 321-332
+Robinson MD and Smyth GK (2008).
+Small-sample estimation of negative binomial dispersion, with applications to SAGE data.
+\emph{Biostatistics}, 9, 321-332.
+\url{http://biostatistics.oxfordjournals.org/content/9/2/321}
}
\author{Mark Robinson, Davis McCarthy, Gordon Smyth}
\examples{
# True dispersion is 1/5=0.2
-y <- matrix(rnbinom(1000,mu=10,size=5),ncol=4)
-d <- DGEList(counts=y,group=c(1,1,2,2),lib.size=c(1000:1003))
-d <- estimateCommonDisp(d, verbose=TRUE)
+y <- matrix(rnbinom(250*4,mu=20,size=5),nrow=250,ncol=4)
+dge <- DGEList(counts=y,group=c(1,1,2,2))
+dge <- estimateCommonDisp(dge, verbose=TRUE)
}
\seealso{
-\code{\link{equalizeLibSizes}}
+\code{\link{equalizeLibSizes}},
+\code{\link{estimateTrendedDisp}},
+\code{\link{estimateTagwiseDisp}}
}
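Continuing the example above, the components added by estimateCommonDisp can be inspected directly; the BCV reported with verbose=TRUE is simply the square root of the common dispersion:

dge$common.dispersion          # estimated common dispersion
sqrt(dge$common.dispersion)    # biological coefficient of variation (BCV)
dge$pseudo.lib.size            # common library size of the pseudo-counts
head(dge$pseudo.counts)        # quantile-quantile normalized counts (mainly for internal use)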
diff --git a/man/expandAsMatrix.Rd b/man/expandAsMatrix.Rd
index 6137f82..c91560d 100644
--- a/man/expandAsMatrix.Rd
+++ b/man/expandAsMatrix.Rd
@@ -5,11 +5,12 @@
Expand scalar or vector to a matrix.
}
\usage{
-expandAsMatrix(x, dim)
+expandAsMatrix(x, dim=NULL, byrow=TRUE)
}
\arguments{
\item{x}{scalar, vector or matrix. If a vector, length must match one of the output dimensions.}
\item{dim}{required dimension for the output matrix.}
+ \item{byrow}{logical. Should the matrix be filled by columns or by rows (the default) if the length of the input vector is equal to both dimensions?}
}
\details{
diff --git a/man/glmQLFTest.Rd b/man/glmQLFTest.Rd
index cd14e57..5d0f8af 100644
--- a/man/glmQLFTest.Rd
+++ b/man/glmQLFTest.Rd
@@ -12,7 +12,7 @@ Conduct genewise statistical tests for a given coefficient or contrast.}
\usage{
\method{glmQLFit}{DGEList}(y, design=NULL, dispersion=NULL, offset=NULL, abundance.trend=TRUE,
robust=FALSE, winsor.tail.p=c(0.05, 0.1), \dots)
-\method{glmQLFit}{default}(y, design=NULL, dispersion=0.05, offset=NULL, lib.size=NULL,
+\method{glmQLFit}{default}(y, design=NULL, dispersion=NULL, offset=NULL, lib.size=NULL,
abundance.trend=TRUE, AveLogCPM=NULL, robust=FALSE,
winsor.tail.p=c(0.05, 0.1), \dots)
glmQLFTest(glmfit, coef=ncol(glmfit$design), contrast=NULL)
@@ -23,7 +23,7 @@ glmQLFTest(glmfit, coef=ncol(glmfit$design), contrast=NULL)
\item{design}{numeric matrix giving the design matrix for the genewise linear models.}
-\item{dispersion}{numeric scalar or vector of negative binomial dispersions. If \code{NULL}, then will be extracted from the \code{DGEList} object \code{y}, with order of precedence: trended dispersions, common dispersion, a constant value of 0.05.}
+\item{dispersion}{numeric scalar, vector or matrix of negative binomial dispersions. If \code{NULL}, then will be extracted from the \code{DGEList} object \code{y}, with order of precedence: trended dispersions, common dispersion, a constant value of 0.05.}
\item{offset}{numeric matrix of same size as \code{y} giving offsets for the log-linear models. Can be a scalar or a vector of length \code{ncol(y)}, in which case it is expanded out to a matrix. If \code{NULL} will be computed by \code{getOffset(y)}.}
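A minimal sketch of the quasi-likelihood workflow documented here, assuming y is a DGEList with estimated dispersions and design is the corresponding design matrix:

fit <- glmQLFit(y, design, robust=TRUE)   # abundance.trend=TRUE by default
qlf <- glmQLFTest(fit, coef=2)            # test the second coefficient
topTags(qlf)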
diff --git a/man/glmTreat.Rd b/man/glmTreat.Rd
index 3aafb61..e709163 100644
--- a/man/glmTreat.Rd
+++ b/man/glmTreat.Rd
@@ -14,21 +14,22 @@ treatDGE(glmfit, coef=ncol(glmfit$design), contrast=NULL, lfc=0)
\arguments{
\item{glmfit}{a \code{DGEGLM} object, usually output from \code{glmFit} or \code{glmQLFit}.}
-\item{coef}{integer or character vector indicating which coefficients of the linear model are to be tested equal to zero. Values must be columns or column names of \code{design}. Defaults to the last coefficient. Ignored if \code{contrast} is specified.}
+\item{coef}{integer or character vector indicating which coefficients of the linear model are to be tested equal to zero. Values must be columns or column names of \code{design}. Defaults to the last coefficient. Ignored if \code{contrast} is specified.}
-\item{contrast}{numeric vector specifying the contrast of the linear model coefficients to be tested against the log2-fold change threshold. Length must equal to the number of columns of \code{design}. If specified, then takes precedence over \code{coef}.}
+\item{contrast}{numeric vector specifying the contrast of the linear model coefficients to be tested against the log2-fold-change threshold. Length must be equal to the number of columns of \code{design}. If specified, then takes precedence over \code{coef}.}
\item{lfc}{numeric scalar specifying the absolute value of the log2-fold change threshold above which differential expression is to be considered.}
}
\value{
\code{glmTreat} produces an object of class \code{DGELRT} with the same components as for \code{glmfit} plus the following:
- \item{lfc}{absolute value of the specified log2-fold change threshold.}
- \item{table}{data frame with the same rows as \code{glmfit} containing the log2-fold changes, average log2-counts per million and p-values, ready to be displayed by \code{topTags.}.}
+ \item{lfc}{absolute value of the specified log2-fold-change threshold.}
+ \item{table}{data frame with the same rows as \code{glmfit} containing the log2-fold-changes, average log2-counts per million and p-values, ready to be displayed by \code{topTags}.}
\item{comparison}{character string describing the coefficient or the contrast being tested.}
The data frame \code{table} contains the following columns:
- \item{logFC}{log2-fold change of expression between conditions being tested.}
+ \item{logFC}{shrunk log2-fold-change of expression between conditions being tested.}
+ \item{unshrunk.logFC}{unshrunk log2-fold-change of expression between conditions being tested. Exists only when \code{prior.count} is not equal to 0 for \code{glmfit}.}
\item{logCPM}{average log2-counts per million, the average taken over all libraries.}
\item{PValue}{p-values.}
}
@@ -44,6 +45,9 @@ In the latter case, it conducts a quasi-likelihood (QL) F-test against the thres
If \code{lfc=0}, then \code{glmTreat} is equivalent to \code{glmLRT} or \code{glmQLFTest}, depending on whether likelihood or quasi-likelihood is being used.
+If there is no shrinkage of the log-fold-changes, i.e., the glms were fitted with \code{prior.count=0}, then \code{unshrunk.logFC} and \code{logFC} are essentially the same, so they are merged into a single \code{logFC} column in \code{table}.
+Note that \code{glmTreat} constructs test statistics using \code{unshrunk.logFC} rather than \code{logFC}.
+
\code{glmTreat} was previously called \code{treatDGE}.
The old function name is now deprecated and will be removed in a future release of edgeR.
}
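A minimal usage sketch of the threshold test described above, assuming fit is a DGEGLM object from glmFit or glmQLFit; the 1.5-fold threshold is purely illustrative:

tr <- glmTreat(fit, coef=2, lfc=log2(1.5))   # require |logFC| above log2(1.5)
topTags(tr)                                  # table includes logFC and unshrunk.logFC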
diff --git a/man/glmfit.Rd b/man/glmfit.Rd
index 541342c..caa2665 100644
--- a/man/glmfit.Rd
+++ b/man/glmfit.Rd
@@ -13,7 +13,7 @@ Conduct genewise statistical tests for a given coefficient or coefficient contra
\method{glmFit}{DGEList}(y, design=NULL, dispersion=NULL, prior.count=0.125, start=NULL, \dots)
\method{glmFit}{default}(y, design=NULL, dispersion=NULL, offset=NULL, lib.size=NULL, weights=NULL,
prior.count=0.125, start=NULL, \dots)
-glmLRT(glmfit, coef=ncol(glmfit$design), contrast=NULL, test="chisq")
+glmLRT(glmfit, coef=ncol(glmfit$design), contrast=NULL)
}
\arguments{
@@ -23,7 +23,7 @@ glmLRT(glmfit, coef=ncol(glmfit$design), contrast=NULL, test="chisq")
Must be of full column rank.
Defaults to a single column of ones, equivalent to treating the columns as replicate libraries.}
-\item{dispersion}{numeric scalar or vector of negative binomial dispersions. Can be a common value for all genes, or a vector of values can provide a unique dispersion value for each gene. If \code{NULL} will be extracted from \code{y}, with order of precedence: genewise dispersion, trended dispersions, common dispersion.}
+\item{dispersion}{numeric scalar, vector or matrix of negative binomial dispersions. Can be a common value for all genes, a vector of dispersion values with one for each gene, or a matrix of dispersion values with one for each observation. If \code{NULL} will be extracted from \code{y}, with order of precedence: genewise dispersion, trended dispersions, common dispersion.}
\item{offset}{numeric matrix of same size as \code{y} giving offsets for the log-linear models. Can be a scalar or a vector of length \code{ncol(y)}, in which case it is expanded out to a matrix.}
@@ -42,8 +42,6 @@ Defaults to a single column of ones, equivalent to treating the columns as repli
\item{coef}{integer or character vector indicating which coefficients of the linear model are to be tested equal to zero. Values must be columns or column names of \code{design}. Defaults to the last coefficient. Ignored if \code{contrast} is specified.}
\item{contrast}{numeric vector or matrix specifying one or more contrasts of the linear model coefficients to be tested equal to zero. Number of rows must be equal to the number of columns of \code{design}. If specified, then takes precedence over \code{coef}.}
-
-\item{test}{which test (distribution) to use in calculating the p-values. Possible values are \code{"F"} or \code{"chisq"}.}
}
\value{
@@ -54,11 +52,12 @@ Defaults to a single column of ones, equivalent to treating the columns as repli
\item{offset}{numeric matrix of linear model offsets.}
\item{dispersion}{vector of dispersions used for the fit.}
\item{coefficients}{numeric matrix of estimated coefficients from the glm fits, on the natural log scale, of size \code{nrow(y)} by \code{ncol(design)}.}
+ \item{unshrunk.coefficients}{numeric matrix of estimated coefficients from the glm fits when no shrinkage of the log-fold-changes is applied, on the natural log scale, of size \code{nrow(y)} by \code{ncol(design)}. It exists only when \code{prior.count} is not 0.}
\item{fitted.values}{matrix of fitted values from glm fits, same number of rows and columns as \code{y}.}
\item{deviance}{numeric vector of deviances, one for each gene.}
\code{glmLRT} produces objects of class \code{DGELRT} with the same components as for \code{glmfit} plus the following:
- \item{table}{data frame with the same rows as \code{y} containing the log2-fold changes, likelhood ratio statistics and p-values, ready to be displayed by \code{topTags.}.}
+ \item{table}{data frame with the same rows as \code{y} containing the log2-fold-changes, likelihood ratio statistics and p-values, ready to be displayed by \code{topTags}.}
\item{comparison}{character string describing the coefficient or the contrast being tested.}
The data frame \code{table} contains the following columns:
diff --git a/man/goana.Rd b/man/goana.Rd
index 8cb755f..7d12c32 100644
--- a/man/goana.Rd
+++ b/man/goana.Rd
@@ -1,26 +1,28 @@
\name{goana.DGELRT}
\alias{goana.DGEExact}
\alias{goana.DGELRT}
-\title{Gene Ontology Analysis of Differentially Expressed Genes}
+\alias{kegga.DGEExact}
+\alias{kegga.DGELRT}
+\title{Gene Ontology or KEGG Analysis of Differentially Expressed Genes}
\description{
-Test for over-representation of gene ontology (GO) terms in the up and down differentially expressed genes from a linear model fit.
+Test for over-representation of gene ontology (GO) terms or KEGG pathways in the up and down differentially expressed genes from a linear model fit.
}
\usage{
-\method{goana}{DGELRT}(de, geneid = rownames(de), FDR = 0.05, species = "Hs",
- trend = FALSE, \dots)
+\method{goana}{DGELRT}(de, geneid = rownames(de), FDR = 0.05, trend = FALSE, \dots)
+\method{kegga}{DGELRT}(de, geneid = rownames(de), FDR = 0.05, trend = FALSE, \dots)
}
\arguments{
\item{de}{a \code{DGELRT} object.}
\item{geneid}{Entrez Gene identifiers. Either a vector of length \code{nrow(de)} or the name of the column of \code{de$genes} containing the Entrez Gene IDs.}
\item{FDR}{false discovery rate cutoff for differentially expressed genes. Numeric value between 0 and 1.}
- \item{species}{species identifier. Possible values are \code{"Hs"}, \code{"Mm"}, \code{"Rn"} or \code{"Dm"}.}
\item{trend}{adjust analysis for gene length or abundance?
Can be logical, or a numeric vector of covariate values, or the name of the column of \code{de$genes} containing the covariate values.
If \code{TRUE}, then \code{de$AveLogCPM} is used as the covariate.}
- \item{\dots}{any other arguments are passed to \code{goana.default}.}
+ \item{\dots}{any other arguments are passed to \code{\link{goana.default}}.}
}
\details{
-Performs Gene Ontology enrichment analyses for the up and down differentially expressed genes from a linear model analysis.
+\code{goana} performs Gene Ontology enrichment analyses for the up and down differentially expressed genes from a linear model analysis.
+\code{kegga} performs the corresponding analysis for KEGG pathways.
The Entrez Gene ID must be supplied for each gene.
If \code{trend=FALSE}, the function computes one-sided hypergeometric tests equivalent to Fisher's exact test.
@@ -29,7 +31,7 @@ If \code{trend=TRUE} or a covariate is supplied, then a trend is fitted to the d
The adjusted test uses Wallenius' noncentral hypergeometric distribution.
}
\value{
-A data frame with a row for each GO term and the following columns:
+\code{goana} produces a data.frame with a row for each GO term and the following columns:
\item{Term}{GO term.}
\item{Ont}{ontology that the GO term belongs to. Possible values are \code{"BP"}, \code{"CC"} and \code{"MF"}.}
\item{N}{Number of genes in the GO term.}
@@ -38,6 +40,8 @@ A data frame with a row for each GO term and the following columns:
\item{P.Up}{p-value for over-representation of GO term in up-regulated genes.}
\item{P.Down}{p-value for over-representation of GO term in down-regulated genes.}
The row names of the data frame give the GO term IDs.
+
+\code{kegga} produces a data.frame as above, except that the row names are KEGG pathway IDs, the Term column becomes Path, and there is no Ont column.
}
\references{
@@ -48,17 +52,17 @@ The row names of the data frame give the GO term IDs.
}
\seealso{
-\code{\link{goana}}, \code{\link{topGO}}
+\code{\link{goana}}, \code{\link{topGO}}, \code{\link{kegga}}, \code{\link{topKEGG}}
}
-\author{Gordon Smyth and Yifang Hu}
+\author{Yunshun Chen and Gordon Smyth}
\examples{
\dontrun{
fit <- glmFit(y, design)
lrt <- glmLRT(fit)
-go <- goana(lrt)
+go <- goana(lrt, species="Hs")
topGO(go, ont="BP", sort = "up")
topGO(go, ont="BP", sort = "down")
}
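The corresponding KEGG pathway analysis uses the new kegga method in the same way; a sketch continuing the example above, with species passed through to the default method:

keg <- kegga(lrt, species="Hs")
topKEGG(keg)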
diff --git a/man/plotMD.Rd b/man/plotMD.Rd
new file mode 100644
index 0000000..c6c189d
--- /dev/null
+++ b/man/plotMD.Rd
@@ -0,0 +1,77 @@
+\title{Mean-Difference Plot of Count Data}
+\name{plotMD.DGEList}
+\alias{plotMD.DGEList}
+\alias{plotMD.DGEGLM}
+\alias{plotMD.DGELRT}
+\alias{plotMD.DGEExact}
+
+\description{
+Creates a mean-difference plot (aka MA plot) with color coding for highlighted points.
+}
+
+\usage{
+\method{plotMD}{DGEList}(object, column = 1, xlab = "Average log CPM (this sample and others)",
+ ylab = "log-ratio (this sample vs others)",
+ main = colnames(object)[column], status=object$genes$Status,
+ zero.weights = FALSE, prior.count = 3, \dots)
+\method{plotMD}{DGEGLM}(object, column = ncol(object), coef = NULL, xlab = "Average log CPM",
+ ylab = "log-fold-change", main = colnames(object)[column],
+ status=object$genes$Status, zero.weights = FALSE, \dots)
+\method{plotMD}{DGELRT}(object, xlab = "Average log CPM",
+ ylab = "log-fold-change", main = object$comparison,
+ status=object$genes$Status, \dots)
+}
+
+\arguments{
+ \item{object}{an object of class \code{DGEList}, \code{DGEGLM}, \code{DGELRT} or \code{DGEExact}.}
+ \item{column}{integer, column of \code{object} to be plotted.}
+ \item{coef}{alternative to \code{column} for fitted model objects. If specified, then \code{column} is ignored.}
+ \item{xlab}{character string, label for x-axis}
+ \item{ylab}{character string, label for y-axis}
+ \item{main}{character string, title for plot}
+ \item{status}{vector giving the control status of each spot on the array, of same length as the number of rows of \code{object}.
+ If \code{NULL}, then all points are plotted in the default color, symbol and size.}
+ \item{zero.weights}{logical, should spots with zero or negative weights be plotted?}
+ \item{prior.count}{the average prior count to be added to each observation. Larger values produce more shrinkage.}
+ \item{\dots}{other arguments are passed to \code{\link{plotWithHighlights}}.}
+}
+
+\details{
+A mean-difference plot (MD-plot) is a plot of log fold changes (differences) versus average log values (means).
+The history of mean-difference plots and MA-plots is reviewed in Ritchie et al (2015).
+
+For \code{DGEList} objects, a between-sample MD-plot is produced.
+Counts are first converted to log2-CPM values.
+An artificial array is produced by averaging all the samples other than the sample specified.
+A mean-difference plot is then produced from the specified sample and the artificial sample.
+This procedure reduces to an ordinary mean-difference plot when there are just two arrays total.
+
+If \code{object} is a \code{DGEGLM} object, then the plot is a fitted model MD-plot in which the estimated coefficient is on the y-axis and the average logCPM value is on the x-axis.
+If \code{object} is a \code{DGEExact} or \code{DGELRT} object, then the MD-plot displays the logFC vs the logCPM values from the results table.
+
+The \code{status} vector can correspond to any grouping of the probes that is of interest.
+If \code{object} is a fitted model object, then the \code{status} vector is often used to indicate statistical significance, so that differentially expressed points are highlighted.
+
+The \code{status} can be included as the component \code{object$genes$Status} instead of being passed as an argument to \code{plotMD}.
+
+See \code{\link{plotWithHighlights}} for how to set colors and graphics parameters for the highlighted and non-highlighted points.
+}
+
+\value{A plot is created on the current graphics device.}
+
+\references{
+Ritchie, ME, Phipson, B, Wu, D, Hu, Y, Law, CW, Shi, W, and Smyth, GK (2015).
+limma powers differential expression analyses for RNA-sequencing and microarray studies.
+\emph{Nucleic Acids Research} Volume 43, e47.
+\url{http://nar.oxfordjournals.org/content/43/7/e47}
+}
+
+\author{Gordon Smyth}
+
+\seealso{
+\code{plotSmear}
+
+The driver function for \code{plotMD} is \code{\link{plotWithHighlights}}.
+}
+
+\keyword{hplot}
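A short sketch of highlighting differentially expressed genes through the status argument, assuming lrt is a DGELRT object from glmLRT and dge is a DGEList; the FDR cutoff and labels are illustrative:

fdr <- p.adjust(lrt$table$PValue, method="BH")
plotMD(lrt, status=ifelse(fdr < 0.05, "DE", "notDE"))
plotMD(dge, column=1)   # between-sample MD-plot: sample 1 vs the average of the others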
diff --git a/man/readDGE.Rd b/man/readDGE.Rd
index 4ea86f5..6c9a3c6 100755
--- a/man/readDGE.Rd
+++ b/man/readDGE.Rd
@@ -1,29 +1,39 @@
\name{readDGE}
\alias{readDGE}
-\title{Read and Merge a Set of Files Containing DGE Data}
+\title{Read and Merge a Set of Files Containing Count Data}
-\description{Reads and merges a set of text files containing digital gene expression data.}
+\description{Reads and merges a set of text files containing gene expression counts.}
\usage{readDGE(files, path=NULL, columns=c(1,2), group=NULL, labels=NULL, \dots)}
\arguments{
-\item{files}{character vector of filenames, or alternatively a data.frame with a column containing the file names of the files containing the libraries of counts and, optionally, columns containing the \code{group} to which each library belongs, descriptions of the other samples and other information.}
+\item{files}{character vector of filenames, or a data.frame of sample information containing a column called \code{files}.}
\item{path}{character string giving the directory containing the files.
-The default is the current working directory.}
-\item{columns}{numeric vector stating which two columns contain the gene names and counts, respectively}
-\item{group}{vector, or preferably a factor, indicating the experimental group to which each library belongs. If \code{group} is not \code{NULL}, then this argument overrides any group information included in the \code{files} argument.}
-\item{labels}{character vector giving short names to associate with the libraries. Defaults to the file names.}
-\item{\dots}{other are passed to \code{read.delim}}
+Defaults to the current working directory.}
+\item{columns}{numeric vector stating which columns of the input files contain the gene names and counts respectively.}
+\item{group}{optional vector or factor indicating the experimental group to which each file belongs.}
+\item{labels}{character vector giving short names to associate with the files. Defaults to the file names.}
+\item{\dots}{other arguments are passed to \code{\link{read.delim}}.}
}
\details{
-Each file is assumed to contained digital gene expression data for one sample (or library), with gene identifiers in the first column and counts in the second column. Gene identifiers are assumed to be unique and not repeated in any one file.
+Each file is assumed to contain digital gene expression data for one genomic sample or count library, with gene identifiers in the first column and counts in the second column.
+Gene identifiers are assumed to be unique and not repeated in any one file.
+The function creates a combined table of counts with rows for genes and columns for samples.
+A count of zero will be entered for any gene that was not found in any particular sample.
+
By default, the files are assumed to be tab-delimited and to contain column headings.
-The function forms the union of all genes and creates one big table with zeros where necessary.
+Other file formats can be handled by adding arguments to be passed to \code{read.delim}.
+For example, use \code{header=FALSE} if there are no column headings and use \code{sep=","} to read a comma-separated file.
+
+Instead of being a vector, the argument \code{files} can be a data.frame containing all the necessary sample information.
+In that case, the filenames and group identifiers can be given as columns \code{files} and \code{group} respectively, and the \code{labels} can be given as the row.names of the data.frame.
}
-\value{DGEList object}
+\value{
+A \code{\link[edgeR:DGEList-class]{DGEList}} object containing a matrix of counts, with a row for each unique tag found in the input files and a column for each input file.
+}
\author{Mark Robinson and Gordon Smyth}
@@ -35,7 +45,9 @@ RG <- readDGE(files)}
}
\seealso{
-\code{\link{DGEList}} provides more information about the \code{DGEList} class and the function \code{DGEList}, which can also be used to construct a \code{DGEList} object, if \code{readDGE} is not required to read in and construct a table of counts from separate files.
+See \code{\link{read.delim}} for other arguments that can be accepted.
+
+\code{\link{DGEList-class}}, \code{\link{DGEList}}.
}
\keyword{file}
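A sketch of the data.frame interface described above; the file names, groups and sample labels are hypothetical:

targets <- data.frame(files=c("sample1.txt","sample2.txt","sample3.txt"),
                      group=c("Control","Control","Treatment"),
                      row.names=c("C1","C2","T1"))
dge <- readDGE(targets, columns=c(1,2))                          # tab-delimited with headers (default)
dge <- readDGE(targets, columns=c(1,2), header=FALSE, sep=",")   # comma-separated, no headers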
diff --git a/man/roast.DGEList.Rd b/man/roast.DGEList.Rd
index 45593ef..3c64814 100644
--- a/man/roast.DGEList.Rd
+++ b/man/roast.DGEList.Rd
@@ -15,7 +15,7 @@ Rotation gene set testing for Negative Binomial generalized linear models.
\item{y}{\code{DGEList} object.}
\item{index}{index vector specifying which rows (genes) of \code{y} are in the test set. This can be a vector of indices, or a logical vector of the same length as \code{statistics}, or any vector such that \code{y[iset,]} contains the values for the gene set to be tested. Defaults to all genes. For \code{mroast} a list of index vectors.}
\item{design}{design matrix}
- \item{contrast}{contrast for which the test is required. Can be an integer specifying a column of \code{design}, or else a contrast vector of length equal to the number of columns of \code{design}.}
+ \item{contrast}{contrast for which the test is required. Can be an integer specifying a column of \code{design}, or the name of a column of \code{design}, or else a contrast vector of length equal to the number of columns of \code{design}.}
\item{\dots}{other arguments are passed to \code{\link{roast.default}} or \code{\link{mroast.default}}.}
}
diff --git a/man/romer.DGEList.Rd b/man/romer.DGEList.Rd
index 994ba38..c22af37 100644
--- a/man/romer.DGEList.Rd
+++ b/man/romer.DGEList.Rd
@@ -13,7 +13,7 @@ Rotation gene set testing for Negative Binomial generalized linear models.
\item{y}{\code{DGEList} object.}
\item{index}{list of indices specifying the rows of \code{y} in the gene sets. The list can be made using \link{ids2indices}.}
\item{design}{design matrix}
- \item{contrast}{contrast for which the test is required. Can be an integer specifying a column of \code{design}, or else a contrast vector of length equal to the number of columns of \code{design}.}
+ \item{contrast}{contrast for which the test is required. Can be an integer specifying a column of \code{design}, or the name of a column of \code{design}, or else a contrast vector of length equal to the number of columns of \code{design}.}
\item{\dots}{other arguments passed to \code{\link{romer.default}}.}
}
diff --git a/src/R_levenberg.cpp b/src/R_levenberg.cpp
index 7149c89..5aa701b 100644
--- a/src/R_levenberg.cpp
+++ b/src/R_levenberg.cpp
@@ -4,7 +4,7 @@
SEXP R_levenberg (SEXP nlib, SEXP ntag, SEXP design, SEXP counts, SEXP disp, SEXP offset, SEXP weights,
SEXP beta, SEXP fitted, SEXP tol, SEXP maxit) try {
if (!isNumeric(design)) { throw std::runtime_error("design matrix should be double precision"); }
- if (!isNumeric(disp)) { throw std::runtime_error("dispersion vector should be double precision"); }
+ if (!isNumeric(disp)) { throw std::runtime_error("dispersion matrix should be double precision"); }
if (!isNumeric(beta)) { throw std::runtime_error("matrix of start values for coefficients should be double precision"); }
if (!isNumeric(fitted)) { throw std::runtime_error("matrix of starting fitted values should be double precision"); }
const int num_tags=asInteger(ntag);
@@ -33,8 +33,8 @@ SEXP R_levenberg (SEXP nlib, SEXP ntag, SEXP design, SEXP counts, SEXP disp, SEX
throw std::runtime_error("dimensions of the beta matrix do not match to the number of tags and coefficients");
} else if (LENGTH(fitted)!=clen) {
throw std::runtime_error("dimensions of the fitted matrix do not match those of the count matrix");
- } else if (LENGTH(disp)!=num_tags) {
- throw std::runtime_error("length of dispersion vector must be equal to the number of tags");
+ } else if (LENGTH(disp)!=clen) {
+ throw std::runtime_error("dimensions of dispersion matrix are not as specified");
}
// Initializing pointers to the assorted features.
@@ -42,9 +42,9 @@ SEXP R_levenberg (SEXP nlib, SEXP ntag, SEXP design, SEXP counts, SEXP disp, SEX
*design_ptr=REAL(design),
*fitted_ptr=REAL(fitted),
*disp_ptr=REAL(disp);
- matvec_check allo(num_libs, num_tags, offset, true, "offset", false);
+ matvec_check allo(num_libs, num_tags, offset, true, "offset");
const double* const* optr2=allo.access();
- matvec_check allw(num_libs, num_tags, weights, true, "weight", true);
+ matvec_check allw(num_libs, num_tags, weights, true, "weight", 1);
const double* const* wptr2=allw.access();
// Initializing output cages.
@@ -80,7 +80,7 @@ SEXP R_levenberg (SEXP nlib, SEXP ntag, SEXP design, SEXP counts, SEXP disp, SEX
#ifdef WEIGHTED
*wptr2,
#endif
- *disp_ptr, new_fitted_ptr, new_beta_ptr)) {
+ disp_ptr, new_fitted_ptr, new_beta_ptr)) {
std::stringstream errout;
errout<< "solution using Cholesky decomposition failed for tag " << tag+1;
throw std::runtime_error(errout.str());
@@ -88,7 +88,7 @@ SEXP R_levenberg (SEXP nlib, SEXP ntag, SEXP design, SEXP counts, SEXP disp, SEX
allo.advance();
allw.advance();
- ++disp_ptr;
+ disp_ptr+=num_libs;
fitted_ptr+=num_libs;
new_fitted_ptr+=num_libs;
beta_ptr+=num_coefs;
diff --git a/src/R_one_group.cpp b/src/R_one_group.cpp
index 4459ff3..83d36cd 100644
--- a/src/R_one_group.cpp
+++ b/src/R_one_group.cpp
@@ -4,8 +4,8 @@
SEXP R_one_group (SEXP nlib, SEXP ntag, SEXP y, SEXP disp, SEXP offsets, SEXP weights, SEXP max_iterations, SEXP tolerance, SEXP beta) try {
const int num_tags=asInteger(ntag);
const int num_libs=asInteger(nlib);
- if (!isNumeric(disp)) { throw std::runtime_error("dispersion vector must be double precision"); }
- if (num_tags!=LENGTH(disp)) { throw std::runtime_error("length of dispersion vector is not equal to number of tags"); }
+ if (!isNumeric(disp)) { throw std::runtime_error("dispersion matrix must be double precision"); }
+ if (num_tags*num_libs != LENGTH(disp)) { throw std::runtime_error("dimensions of dispersion matrix are not as specified"); }
if (num_tags*num_libs != LENGTH(y) ) { throw std::runtime_error("dimensions of the count table are not as specified"); } // Checking that it is an exact division.
if (!isNumeric(beta)) { throw std::runtime_error("beta start vector must be double precision"); }
@@ -30,9 +30,9 @@ SEXP R_one_group (SEXP nlib, SEXP ntag, SEXP y, SEXP disp, SEXP offsets, SEXP we
if (!isNumeric(y)) { throw std::runtime_error("count matrix must be integer or double-precision"); }
ydptr=REAL(y);
}
- matvec_check allo(num_libs, num_tags, offsets, false, "offset", false);
+ matvec_check allo(num_libs, num_tags, offsets, false, "offset");
const double* const* optr2=allo.access();
- matvec_check allw(num_libs, num_tags, weights, false, "weight", true);
+ matvec_check allw(num_libs, num_tags, weights, false, "weight", 1);
const double* const* wptr2=allw.access();
const double* dptr=REAL(disp);
@@ -58,10 +58,11 @@ SEXP R_one_group (SEXP nlib, SEXP ntag, SEXP y, SEXP disp, SEXP offsets, SEXP we
#ifdef WEIGHTED
*wptr2,
#endif
- yptr, dptr[tag], (use_old_start ? bsptr[tag] : R_NaReal));
+ yptr, dptr, (use_old_start ? bsptr[tag] : R_NaReal));
bptr[tag]=out.first;
cptr[tag]=out.second;
+ dptr+=num_libs;
allo.advance();
allw.advance();
}
diff --git a/src/glm.h b/src/glm.h
index 5fbac85..2bc6cd8 100644
--- a/src/glm.h
+++ b/src/glm.h
@@ -8,7 +8,7 @@ std::pair<double,bool> glm_one_group(const int&, const int&, const double&, cons
#ifdef WEIGHTED
const double*,
#endif
- const double*, const double&, double);
+ const double*, const double*, double);
class glm_levenberg {
public:
@@ -18,7 +18,7 @@ public:
#ifdef WEIGHTED
const double*,
#endif
- const double&, double*, double*);
+ const double*, double*, double*);
const bool& is_failure() const;
const int& get_iterations() const;
@@ -46,7 +46,7 @@ private:
#ifdef WEIGHTED
const double*,
#endif
- const double&) const;
+ const double*) const;
void autofill(const double*, double*, const double*);
};
diff --git a/src/glm_levenberg.cpp b/src/glm_levenberg.cpp
index 4a9788f..af11c37 100644
--- a/src/glm_levenberg.cpp
+++ b/src/glm_levenberg.cpp
@@ -15,13 +15,13 @@ double glm_levenberg::nb_deviance (const double* y, const double* mu,
#ifdef WEIGHTED
const double* w,
#endif
- const double& phi) const {
+ const double* phi) const {
double tempdev=0;
for (int i=0; i<nlibs; ++i) {
#ifdef WEIGHTED
- tempdev+=w[i]*compute_unit_nb_deviance(y[i], mu[i], phi);
+ tempdev+=w[i]*compute_unit_nb_deviance(y[i], mu[i], phi[i]);
#else
- tempdev+=compute_unit_nb_deviance(y[i], mu[i], phi);
+ tempdev+=compute_unit_nb_deviance(y[i], mu[i], phi[i]);
#endif
}
return tempdev;
@@ -77,7 +77,7 @@ int glm_levenberg::fit(const double* offset, const double* y,
#ifdef WEIGHTED
const double* w,
#endif
- const double& disp, double* mu, double* beta) {
+ const double* disp, double* mu, double* beta) {
// We expect 'mu' and 'beta' to be supplied. We then check the maximum value of the counts.
double ymax=0;
for (int lib=0; lib<nlibs; ++lib) {
@@ -124,7 +124,7 @@ int glm_levenberg::fit(const double* offset, const double* y,
*/
for (int row=0; row<nlibs; ++row) {
const double& cur_mu=mu[row];
- const double denom=(1+cur_mu*disp);
+ const double denom=(1+cur_mu*disp[row]);
#ifdef WEIGHTED
const double weight=cur_mu/denom*w[row];
const double deriv=(y[row]-cur_mu)/denom*w[row];
diff --git a/src/glm_one_group.cpp b/src/glm_one_group.cpp
index 8daa626..8bb6643 100644
--- a/src/glm_one_group.cpp
+++ b/src/glm_one_group.cpp
@@ -4,7 +4,7 @@ std::pair<double,bool> glm_one_group(const int& nlibs, const int& maxit, const d
#ifdef WEIGHTED
const double* weights,
#endif
- const double* y, const double& disp, double cur_beta) {
+ const double* y, const double* disp, double cur_beta) {
/* Setting up initial values for beta as the log of the mean of the ratio of counts to offsets.
* This is the exact solution for the gamma distribution (which is the limit of the NB as
* the dispersion goes to infinity. However, if cur_beta is not NA, then we assume it's good.
@@ -48,7 +48,7 @@ std::pair<double,bool> glm_one_group(const int& nlibs, const int& maxit, const d
dl=0;
info=0;
for (int j=0; j<nlibs; ++j) {
- const double mu=std::exp(cur_beta+offset[j]), denominator=1+mu*disp;
+ const double mu=std::exp(cur_beta+offset[j]), denominator=1+mu*disp[j];
#ifdef WEIGHTED
dl+=(y[j]-mu)/denominator * weights[j];
info+=mu/denominator * weights[j];
diff --git a/src/matvec_check.cpp b/src/matvec_check.cpp
index 6d5336a..59a6593 100644
--- a/src/matvec_check.cpp
+++ b/src/matvec_check.cpp
@@ -1,21 +1,10 @@
#include "matvec_check.h"
matvec_check::matvec_check(const int nlib, const int nlen, SEXP incoming, const bool transposed,
- const char* err, const bool nullok) : mycheck(NULL), temp(NULL), isvec(true), istran(transposed),
+ const char* err, const double nullfill) : mycheck(NULL), temp(NULL), isvec(true), istran(transposed),
nl(nlib), nt(nlen), tagdex(0), libdex(0) {
- // Checking if NULL (and whether it's allowed). If it is, it becomes a vector of 1's.
- std::stringstream failed;
- if (incoming==R_NilValue) {
- if (!nullok) {
- failed << err << " vector or matrix cannot be null";
- throw std::runtime_error(failed.str());
- }
- temp=new double[nl];
- for (int i=0; i<nl; ++i) { temp[i]=1; }
- mycheck=temp;
- return;
- }
+ std::stringstream failed;
if (!isNumeric(incoming)) {
failed << err << " vector or matrix should be double precision";
throw std::runtime_error(failed.str());
@@ -25,13 +14,8 @@ matvec_check::matvec_check(const int nlib, const int nlen, SEXP incoming, const
mycheck=REAL(incoming);
const int curlen=LENGTH(incoming);
if (curlen==0) {
- // If it's empty, it's treated as a null.
- if (!nullok) {
- failed << err << " vector or matrix cannot be empty";
- throw std::runtime_error(failed.str());
- }
temp=new double[nl];
- for (int i=0; i<nl; ++i) { temp[i]=1; }
+ for (int i=0; i<nl; ++i) { temp[i]=nullfill; }
mycheck=temp;
} else if (curlen!=nl) {
isvec=false;
diff --git a/src/matvec_check.h b/src/matvec_check.h
index 17d1b99..657c586 100644
--- a/src/matvec_check.h
+++ b/src/matvec_check.h
@@ -4,7 +4,7 @@
class matvec_check {
public:
- matvec_check(const int, const int, SEXP, const bool, const char*, const bool);
+ matvec_check(const int, const int, SEXP, const bool, const char*, const double=0);
~matvec_check();
void advance();
const double* const* access() const;
diff --git a/tests/edgeR-Tests.Rout.save b/tests/edgeR-Tests.Rout.save
index d9479a2..b0c9742 100644
--- a/tests/edgeR-Tests.Rout.save
+++ b/tests/edgeR-Tests.Rout.save
@@ -147,7 +147,177 @@ Tag.21 -1.7266049 13.38327 0.03545446 0.3899990
Tag.6 -1.6329841 12.81479 0.10632987 0.7797524
Tag.2 4.0861092 11.54121 0.16057893 0.8831841
Tag.16 0.9324935 13.57074 0.26348818 0.9658389
-Tag.20
+Tag.20 0.8543140 13.76364 0.31674090 0.9658389
+Tag.19 -0.7976354 13.31405 0.35564858 0.9658389
+Tag.3 -0.7300593 13.54155 0.38833737 0.9658389
+Tag.12 0.7081041 14.31389 0.41513004 0.9658389
+Tag.8 -0.7918152 12.86353 0.48483449 0.9658389
+>
+> # mglmOneWay
+> design <- model.matrix(~group,data=d$samples)
+> mglmOneWay(d[1:10,],design,dispersion=0.2)
+$coefficients
+ (Intercept) group2
+ [1,] -1.000000e+08 0.000000e+00
+ [2,] -1.000000e+08 1.000000e+08
+ [3,] 2.525729e+00 -5.108256e-01
+ [4,] 2.525729e+00 1.484200e-01
+ [5,] 2.140066e+00 -1.941560e-01
+ [6,] 2.079442e+00 -1.163151e+00
+ [7,] 2.014903e+00 2.363888e-01
+ [8,] 1.945910e+00 -5.596158e-01
+ [9,] 1.504077e+00 2.006707e-01
+[10,] 2.302585e+00 2.623643e-01
+
+$fitted.values
+ [,1] [,2] [,3] [,4]
+ [1,] 0.0 0.0 0.0 0.0
+ [2,] 0.0 0.0 2.0 2.0
+ [3,] 12.5 12.5 7.5 7.5
+ [4,] 12.5 12.5 14.5 14.5
+ [5,] 8.5 8.5 7.0 7.0
+ [6,] 8.0 8.0 2.5 2.5
+ [7,] 7.5 7.5 9.5 9.5
+ [8,] 7.0 7.0 4.0 4.0
+ [9,] 4.5 4.5 5.5 5.5
+[10,] 10.0 10.0 13.0 13.0
+
+> mglmOneWay(d[1:10,],design,dispersion=0)
+$coefficients
+ (Intercept) group2
+ [1,] -1.000000e+08 0.000000e+00
+ [2,] -1.000000e+08 1.000000e+08
+ [3,] 2.525729e+00 -5.108256e-01
+ [4,] 2.525729e+00 1.484200e-01
+ [5,] 2.140066e+00 -1.941560e-01
+ [6,] 2.079442e+00 -1.163151e+00
+ [7,] 2.014903e+00 2.363888e-01
+ [8,] 1.945910e+00 -5.596158e-01
+ [9,] 1.504077e+00 2.006707e-01
+[10,] 2.302585e+00 2.623643e-01
+
+$fitted.values
+ [,1] [,2] [,3] [,4]
+ [1,] 0.0 0.0 0.0 0.0
+ [2,] 0.0 0.0 2.0 2.0
+ [3,] 12.5 12.5 7.5 7.5
+ [4,] 12.5 12.5 14.5 14.5
+ [5,] 8.5 8.5 7.0 7.0
+ [6,] 8.0 8.0 2.5 2.5
+ [7,] 7.5 7.5 9.5 9.5
+ [8,] 7.0 7.0 4.0 4.0
+ [9,] 4.5 4.5 5.5 5.5
+[10,] 10.0 10.0 13.0 13.0
+
+>
+> fit <- glmFit(d,design,dispersion=d$common.dispersion,prior.count=0.5/4)
+> lrt <- glmLRT(fit,coef=2)
+> topTags(lrt)
+Coefficient: group2
+ logFC logCPM LR PValue FDR
+Tag.17 2.0450964 13.73726 6.0485417 0.01391779 0.3058698
+Tag.2 4.0861092 11.54121 4.8400340 0.02780635 0.3058698
+Tag.21 -1.7265870 13.38327 4.0777825 0.04345065 0.3186381
+Tag.6 -1.6329986 12.81479 3.0078205 0.08286364 0.4557500
+Tag.16 0.9324996 13.57074 1.3477682 0.24566867 0.8276702
+Tag.20 0.8543138 13.76364 1.1890032 0.27553071 0.8276702
+Tag.19 -0.7976602 13.31405 0.9279151 0.33540526 0.8276702
+Tag.12 0.7081170 14.31389 0.9095513 0.34023349 0.8276702
+Tag.3 -0.7300410 13.54155 0.8300307 0.36226364 0.8276702
+Tag.8 -0.7917906 12.86353 0.7830377 0.37621371 0.8276702
+>
+> fit <- glmFit(d,design,dispersion=d$common.dispersion,prior.count=0.5)
+> summary(fit$coef)
+ (Intercept) group2
+ Min. :-7.604 Min. :-1.13681
+ 1st Qu.:-4.895 1st Qu.:-0.32341
+ Median :-4.713 Median : 0.15083
+ Mean :-4.940 Mean : 0.07817
+ 3rd Qu.:-4.524 3rd Qu.: 0.35163
+ Max. :-4.107 Max. : 1.60864
+>
+> fit <- glmFit(d,design,prior.count=0.5/4)
+> lrt <- glmLRT(fit,coef=2)
+> topTags(lrt)
+Coefficient: group2
+ logFC logCPM LR PValue FDR
+Tag.17 2.0450964 13.73726 6.0485417 0.01391779 0.3058698
+Tag.2 4.0861092 11.54121 4.8400340 0.02780635 0.3058698
+Tag.21 -1.7265870 13.38327 4.0777825 0.04345065 0.3186381
+Tag.6 -1.6329986 12.81479 3.0078205 0.08286364 0.4557500
+Tag.16 0.9324996 13.57074 1.3477682 0.24566867 0.8276702
+Tag.20 0.8543138 13.76364 1.1890032 0.27553071 0.8276702
+Tag.19 -0.7976602 13.31405 0.9279151 0.33540526 0.8276702
+Tag.12 0.7081170 14.31389 0.9095513 0.34023349 0.8276702
+Tag.3 -0.7300410 13.54155 0.8300307 0.36226364 0.8276702
+Tag.8 -0.7917906 12.86353 0.7830377 0.37621371 0.8276702
+>
+> dglm <- estimateGLMCommonDisp(d,design)
+> dglm$common.dispersion
+[1] 0.2033282
+> dglm <- estimateGLMTagwiseDisp(dglm,design,prior.df=20)
+> summary(dglm$tagwise.dispersion)
+ Min. 1st Qu. Median Mean 3rd Qu. Max.
+ 0.1756 0.1879 0.1998 0.2031 0.2135 0.2578
+> fit <- glmFit(dglm,design,prior.count=0.5/4)
+> lrt <- glmLRT(fit,coef=2)
+> topTags(lrt)
+Coefficient: group2
+ logFC logCPM LR PValue FDR
+Tag.17 2.0450988 13.73726 6.8001118 0.009115216 0.2005348
+Tag.2 4.0861092 11.54121 4.8594088 0.027495756 0.2872068
+Tag.21 -1.7265904 13.38327 4.2537154 0.039164558 0.2872068
+Tag.6 -1.6329904 12.81479 3.1763761 0.074710253 0.4109064
+Tag.16 0.9324970 13.57074 1.4126709 0.234613512 0.8499599
+Tag.20 0.8543183 13.76364 1.2721097 0.259371274 0.8499599
+Tag.19 -0.7976614 13.31405 0.9190392 0.337727381 0.8499599
+Tag.12 0.7081163 14.31389 0.9014515 0.342392806 0.8499599
+Tag.3 -0.7300488 13.54155 0.8817937 0.347710872 0.8499599
+Tag.8 -0.7918166 12.86353 0.7356185 0.391068049 0.8603497
+> dglm <- estimateGLMTrendedDisp(dglm,design)
+> summary(dglm$trended.dispersion)
+ Min. 1st Qu. Median Mean 3rd Qu. Max.
+ 0.1522 0.1676 0.1740 0.1887 0.2000 0.3469
+> dglm <- estimateGLMTrendedDisp(dglm,design,method="power")
+> summary(dglm$trended.dispersion)
+ Min. 1st Qu. Median Mean 3rd Qu. Max.
+ 0.1522 0.1676 0.1740 0.1887 0.2000 0.3469
+> dglm <- estimateGLMTrendedDisp(dglm,design,method="spline")
+Loading required package: splines
+> summary(dglm$trended.dispersion)
+ Min. 1st Qu. Median Mean 3rd Qu. Max.
+0.09353 0.11080 0.15460 0.19010 0.23050 0.52010
+> dglm <- estimateGLMTrendedDisp(dglm,design,method="bin.spline")
+> summary(dglm$trended.dispersion)
+ Min. 1st Qu. Median Mean 3rd Qu. Max.
+ 0.1997 0.1997 0.1997 0.1997 0.1997 0.1997
+> dglm <- estimateGLMTagwiseDisp(dglm,design,prior.df=20)
+> summary(dglm$tagwise.dispersion)
+ Min. 1st Qu. Median Mean 3rd Qu. Max.
+ 0.1385 0.1792 0.1964 0.1935 0.2026 0.2709
+>
+> dglm2 <- estimateDisp(dglm, design)
+> summary(dglm2$tagwise.dispersion)
+ Min. 1st Qu. Median Mean 3rd Qu. Max.
+ 0.1766 0.1789 0.1814 0.1846 0.1870 0.2119
+> dglm2 <- estimateDisp(dglm, design, prior.df=20)
+> summary(dglm2$tagwise.dispersion)
+ Min. 1st Qu. Median Mean 3rd Qu. Max.
+ 0.1527 0.1669 0.1814 0.1858 0.1951 0.2497
+> dglm2 <- estimateDisp(dglm, design, robust=TRUE)
+Loading required package: statmod
+> summary(dglm2$tagwise.dispersion)
+ Min. 1st Qu. Median Mean 3rd Qu. Max.
+ 0.1766 0.1789 0.1814 0.1846 0.1870 0.2119
+>
+> # Continuous trend
+> nlibs <- 3
+> ntags <- 1000
+> dispersion.true <- 0.1
+> # Make first transcript respond to covariate x
+> x <- 0:2
+> design <- model.matrix(~x)
+> beta.true <- cbind(Beta1=2,Beta2=c(2,rep(0,ntags-1)))
> mu.true <- 2^(beta.true %*% t(design))
> # Generate count data
> y <- rnbinom(ntags*nlibs,mu=mu.true,size=1/dispersion.true)
@@ -392,52 +562,3 @@ $n0
> proc.time()
user system elapsed
3.58 0.04 5.32
- edgeR/vignettes/ 0000755 0001263 0001264 00000000000 1 [...]
-%\VignetteKeyword{RNA-Seq}
-%\VignetteKeyword{differential expression}
-%\VignettePackage{edgeR}
-\documentclass[12pt]{article}
-
-\textwidth=6.2in
-\textheight=8.5in
-\oddsidemargin=0.2in
-\evensidemargin=0.2in
-\headheight=0in
-\headsep=0in
-
-\begin{document}
-
-\title{edgeR Package Introduction}
-\author{Yunshun Chen, Davis McCarthy, Aaron Lun,\\
-Xiaobei Zhou, Mark Robinson, Gordon K.\ Smyth}
-\date{10 October 2012\\
-Revised 8 October 2014}
-\maketitle
-
-
-edgeR is a package for the differential expression analysis of digital gene expression data,
-that is, of count data arising from DNA sequencing technologies.
-It is especially designed for differential expression analyses of RNA-Seq or SAGE data,
-or differential marking analyses of ChIP-Seq data.
-
-edgeR implements novel statistical methods based on the negative binomial distribution
-as a model for count variability, including empirical Bayes methods, exact tests, and generalized linear models.
-The package is especially suitable for analysing designed experiments with multiple
-experimental factors but possibly small numbers of replicates.
-It has unique abilities to model transcript specific variation even in small samples,
-a capability essential for prioritizing genes or transcripts that have consistent effects across replicates.
-
-The full edgeR User's Guide is available as part of the online documentation.
-To reach the User's Guide, install the edgeR package and load it into an R session by \texttt{library(edgeR)}.
-In R for Windows, the User's Guide will then be available from the drop-down menu called ``Vignettes''.
-In other operating systems, type
-\begin{Schunk}
-\begin{Sinput}
-> library(edgeR)
-> edgeRUsersGuide()
-\end{Sinput}
-\end{Schunk}
-at the R prompt to open the User's Guide in a pdf viewer.
-
-\end{document}
- [...]
\ No newline at end of file
diff --git a/vignettes/edgeR.Rnw b/vignettes/edgeR.Rnw
new file mode 100755
index 0000000..af5781e
--- /dev/null
+++ b/vignettes/edgeR.Rnw
@@ -0,0 +1,48 @@
+%\VignetteIndexEntry{edgeR Vignette}
+%\VignetteKeyword{RNA-Seq}
+%\VignetteKeyword{differential expression}
+%\VignettePackage{edgeR}
+\documentclass[12pt]{article}
+
+\textwidth=6.2in
+\textheight=8.5in
+\oddsidemargin=0.2in
+\evensidemargin=0.2in
+\headheight=0in
+\headsep=0in
+
+\begin{document}
+
+\title{edgeR Package Introduction}
+\author{Yunshun Chen, Davis McCarthy, Aaron Lun,\\
+Xiaobei Zhou, Mark Robinson, Gordon K.\ Smyth}
+\date{10 October 2012\\
+Revised 8 October 2014}
+\maketitle
+
+
+edgeR is a package for the differential expression analysis of digital gene expression data,
+that is, of count data arising from DNA sequencing technologies.
+It is especially designed for differential expression analyses of RNA-Seq or SAGE data,
+or differential marking analyses of ChIP-Seq data.
+
+edgeR implements novel statistical methods based on the negative binomial distribution
+as a model for count variability, including empirical Bayes methods, exact tests, and generalized linear models.
+The package is especially suitable for analysing designed experiments with multiple
+experimental factors but possibly small numbers of replicates.
+It has unique abilities to model transcript specific variation even in small samples,
+a capability essential for prioritizing genes or transcripts that have consistent effects across replicates.
+
+The full edgeR User's Guide is available as part of the online documentation.
+To reach the User's Guide, install the edgeR package and load it into an R session by \texttt{library(edgeR)}.
+In R for Windows, the User's Guide will then be available from the drop-down menu called ``Vignettes''.
+In other operating systems, type
+\begin{Schunk}
+\begin{Sinput}
+> library(edgeR)
+> edgeRUsersGuide()
+\end{Sinput}
+\end{Schunk}
+at the R prompt to open the User's Guide in a pdf viewer.
+
+\end{document}
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/r-bioc-edger.git