[med-svn] [r-bioc-edger] 01/06: Imported Upstream version 3.8.0+dfsg
Andreas Tille
tille at debian.org
Fri Oct 17 14:21:21 UTC 2014
This is an automated email from the git hooks/post-receive script.
tille pushed a commit to branch master
in repository r-bioc-edger.
commit 0d6d1ce989f8b5b531171dc4a3976c04dbde8bba
Author: Andreas Tille <tille at debian.org>
Date: Thu Oct 16 13:46:25 2014 +0200
Imported Upstream version 3.8.0+dfsg
---
DESCRIPTION | 8 +-
NAMESPACE | 16 +-
R/adjustedProfileLik.R | 17 +-
R/aveLogCPM.R | 5 +-
R/calcNormFactors.R | 48 ++--
R/diffSpliceDGE.R | 253 +++++++++++++++++++
R/equalizeLibSizes.R | 10 +-
R/estimateDisp.R | 107 ++++----
R/exactTestByDeviance.R | 2 +-
R/glmQLFTest.R | 107 ++++----
R/glmfit.R | 12 +-
R/goana.R | 66 +++++
R/goodTuring.R | 2 +-
R/loessByCol.R | 2 +-
R/maximizeInterpolant.R | 2 +-
R/mglmLevenberg.R | 2 +-
R/mglmOneGroup.R | 10 +-
R/mglmOneWay.R | 11 +-
R/nbinomDeviance.R | 7 +-
R/processAmplicons.R | 209 ++++++++++++++++
R/processHairpinReads.R | 124 ---------
R/readDGE.R | 4 +-
R/residDF.R | 10 +-
R/subsetting.R | 8 +-
R/treatDGE.R | 96 +++++++
inst/CITATION | 24 +-
inst/NEWS.Rd | 35 ++-
inst/doc/edgeR.Rnw | 6 +-
inst/doc/edgeR.pdf | Bin 48051 -> 48664 bytes
man/adjustedProfileLik.Rd | 14 +-
man/aveLogCPM.Rd | 2 +-
man/calcNormFactors.Rd | 15 +-
man/diffSpliceDGE.Rd | 71 ++++++
man/equalizeLibSizes.Rd | 22 +-
man/estimateDisp.Rd | 6 +-
man/glmQLFTest.Rd | 115 +++++++++
man/glmfit.Rd | 36 +--
man/goana.Rd | 67 +++++
man/mglm.Rd | 6 +-
man/plotQLDisp.Rd | 48 ++++
man/plotSpliceDGE.Rd | 26 ++
man/processAmplicons.Rd | 61 +++++
man/processHairpinReads.Rd | 55 ----
man/topSpliceDGE.Rd | 33 +++
man/treatDGE.Rd | 62 +++++
src/Makevars | 3 -
src/R_compute_nbdev.cpp | 19 +-
src/R_cr_adjust.cpp | 21 +-
src/R_exact_test_by_deviance.cpp | 25 +-
src/R_levenberg.cpp | 53 ++--
src/R_loess_by_col.cpp | 24 +-
src/R_maximize_interpolant.cpp | 18 +-
src/R_one_group.cpp | 56 ++---
src/R_process_hairpin_reads.c | 511 +++++++++++++++++++++++++-------------
src/R_simple_good_turing.cpp | 24 +-
src/glm.h | 2 +-
src/glm_one_group.cpp | 44 ++--
src/init.cpp | 30 +++
src/matvec_check.cpp | 4 +-
src/utils.h | 30 ++-
tests/edgeR-Tests.R | 14 ++
tests/edgeR-Tests.Rout.save | 197 +++++++++++----
{inst/doc => vignettes}/edgeR.Rnw | 6 +-
63 files changed, 2140 insertions(+), 783 deletions(-)
diff --git a/DESCRIPTION b/DESCRIPTION
index 1060425..dd553ea 100755
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,8 +1,8 @@
Package: edgeR
-Version: 3.6.8
-Date: 2014/08/14
+Version: 3.8.0
+Date: 2014/10/10
Title: Empirical analysis of digital gene expression data in R
-Author: Yunshun Chen <yuchen at wehi.edu.au>, Davis McCarthy <dmccarthy at wehi.edu.au>, Aaron Lun <alun at wehi.edu.au>, Mark Robinson <mark.robinson at imls.uzh.ch>, Xiaobei Zhou <xiaobei.zhou at uzh.ch>, Gordon Smyth <smyth at wehi.edu.au>
+Author: Yunshun Chen <yuchen at wehi.edu.au>, Davis McCarthy <dmccarthy at wehi.edu.au>, Aaron Lun <alun at wehi.edu.au>, Xiaobei Zhou <xiaobei.zhou at uzh.ch>, Mark Robinson <mark.robinson at imls.uzh.ch>, Gordon Smyth <smyth at wehi.edu.au>
Maintainer: Yunshun Chen <yuchen at wehi.edu.au>, Mark Robinson <mark.robinson at imls.uzh.ch>, Davis McCarthy <dmccarthy at wehi.edu.au>, Gordon Smyth <smyth at wehi.edu.au>
License: GPL (>=2)
Depends: R (>= 2.15.0), limma
@@ -15,4 +15,4 @@ biocViews: GeneExpression, Transcription, AlternativeSplicing,
TimeCourse, SAGE, Sequencing, ChIPSeq, RNASeq, BatchEffect,
MultipleComparison, Normalization, QualityControl
Description: Differential expression analysis of RNA-seq and digital gene expression profiles with biological replication. Uses empirical Bayes estimation and exact tests based on the negative binomial distribution. Also useful for differential signal analysis with other types of genome-scale count data.
-Packaged: 2014-08-15 02:04:09 UTC; biocbuild
+Packaged: 2014-10-14 01:06:08 UTC; biocbuild
diff --git a/NAMESPACE b/NAMESPACE
index 37860d8..4e884e7 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -1,5 +1,5 @@
# Calling the dynamic library.
-useDynLib(edgeR)
+useDynLib(edgeR, .registration=TRUE, .fixes=".c")
# All functions exported other than those starting with "."
exportPattern("^[^\\.]")
@@ -8,11 +8,13 @@ exportClasses("DGEList","DGEExact","DGEGLM","DGELRT","TopTags")
exportMethods("show")
import(methods)
-importFrom("limma",camera,is.fullrank,loessFit,mroast,nonEstimable,plotMDS,roast,squeezeVar,subsetListOfArrays)
+importFrom("limma",camera,goana,is.fullrank,loessFit,mroast,nonEstimable,plotMDS,roast,squeezeVar,subsetListOfArrays)
importClassesFrom("limma","LargeDataObject","Roast","MDS")
S3method(as.data.frame,TopTags)
S3method(as.matrix,DGEList)
+S3method(calcNormFactors,default)
+S3method(calcNormFactors,DGEList)
S3method(dim,DGEList)
S3method(dim,DGEExact)
S3method(dim,DGEGLM)
@@ -34,13 +36,23 @@ S3method(plotMDS,DGEList)
S3method(camera,DGEList)
S3method(roast,DGEList)
S3method(mroast,DGEList)
+S3method(aveLogCPM,default)
S3method(aveLogCPM,DGEList)
S3method(aveLogCPM,DGEGLM)
S3method(cpm,DGEList)
+S3method(estimateGLMCommonDisp,default)
S3method(estimateGLMCommonDisp,DGEList)
+S3method(estimateGLMTrendedDisp,default)
S3method(estimateGLMTrendedDisp,DGEList)
+S3method(estimateGLMTagwiseDisp,default)
S3method(estimateGLMTagwiseDisp,DGEList)
+S3method(glmFit,default)
S3method(glmFit,DGEList)
+S3method(predFC,default)
S3method(predFC,DGEList)
+S3method(rpkm,default)
S3method(rpkm,DGEList)
+S3method(sumTechReps,default)
S3method(sumTechReps,DGEList)
+S3method(goana,DGEExact)
+S3method(goana,DGELRT)
diff --git a/R/adjustedProfileLik.R b/R/adjustedProfileLik.R
index 200fcb2..099e032 100644
--- a/R/adjustedProfileLik.R
+++ b/R/adjustedProfileLik.R
@@ -1,10 +1,10 @@
-adjustedProfileLik <- function(dispersion, y, design, offset, weights=NULL, adjust=TRUE)
+adjustedProfileLik <- function(dispersion, y, design, offset, weights=NULL, adjust=TRUE, start=NULL, get.coef=FALSE)
# tagwise Cox-Reid adjusted profile likelihoods for the dispersion
# dispersion can be scalar or tagwise vector
# y is matrix: rows are genes/tags/transcripts, columns are samples/libraries
# offset is matrix of the same dimensions as y
# Yunshun Chen, Gordon Smyth, Aaron Lun
-# Created June 2010. Last modified 17 Feb 2014.
+# Created June 2010. Last modified 11 Sep 2014.
{
if(any(dim(y)!=dim(offset))) offset <- expandAsMatrix(offset,dim(y))
ntags <- nrow(y)
@@ -13,7 +13,7 @@ adjustedProfileLik <- function(dispersion, y, design, offset, weights=NULL, adju
# Fit tagwise linear models. This is actually the most time-consuming
# operation that I can see for this function.
- fit <- glmFit(y,design=design,dispersion=dispersion,offset=offset,prior.count=0,weights=weights)
+ fit <- glmFit(y,design=design,dispersion=dispersion,offset=offset,prior.count=0,weights=weights,start=start)
# Compute log-likelihood.
mu <- fit$fitted
@@ -46,10 +46,15 @@ adjustedProfileLik <- function(dispersion, y, design, offset, weights=NULL, adju
# Note the use of a transposed matrix for easy row access in column-major format.
if (!is.double(W)) storage.mode(W)<-"double"
if (!is.double(design)) storage.mode(design)<-"double"
- cr <- .Call("R_cr_adjust", t(W), design, nrow(design), PACKAGE="edgeR")
+ cr <- .Call(.cR_cr_adjust, t(W), design, nrow(design))
if (is.character(cr)) { stop(cr) }
}
-
- return(loglik - cr)
+
+# Deciding what to return.
+ if (get.coef) {
+ return(list(apl=loglik-cr, beta=fit$coefficients))
+ } else {
+ return(loglik - cr)
+ }
}
diff --git a/R/aveLogCPM.R b/R/aveLogCPM.R
index 35f16ed..5b88d53 100644
--- a/R/aveLogCPM.R
+++ b/R/aveLogCPM.R
@@ -37,11 +37,12 @@ aveLogCPM.DGEGLM <- function(y, prior.count=2, dispersion=NULL, ...)
aveLogCPM.default <- function(y,lib.size=NULL,offset=NULL,prior.count=2,dispersion=NULL,weights=NULL, ...)
# log2(AveCPM)
# Gordon Smyth
-# Created 25 Aug 2012. Last modified 12 July 2014.
+# Created 25 Aug 2012. Last modified 25 Sep 2014.
{
# Check y
y <- as.matrix(y)
- if(any(y<0)) stop("y must be non-negative")
+ if(any(is.na(y))) stop("NA counts not allowed")
+ if(any(y<0)) stop("counts must be non-negative")
# Check prior.count
if(prior.count<0) prior.count <- 0
diff --git a/R/calcNormFactors.R b/R/calcNormFactors.R
index 9890a1c..10de754 100644
--- a/R/calcNormFactors.R
+++ b/R/calcNormFactors.R
@@ -1,18 +1,37 @@
-calcNormFactors <- function(object, method=c("TMM","RLE","upperquartile","none"), refColumn=NULL, logratioTrim=.3, sumTrim=0.05, doWeighting=TRUE, Acutoff=-1e10, p=0.75)
+calcNormFactors <- function(object, ...)
+UseMethod("calcNormFactors")
+
+calcNormFactors.DGEList <- function(object, method=c("TMM","RLE","upperquartile","none"), refColumn=NULL, logratioTrim=.3, sumTrim=0.05, doWeighting=TRUE, Acutoff=-1e10, p=0.75, ...)
# Scale normalization of RNA-Seq data.
# Mark Robinson. Edits by Gordon Smyth.
-# Created October 22 October 2009. Last modified 16 Apr 2013.
+# Created October 22 October 2009. Last modified 2 Oct 2014.
{
# Check object
- if(is(object,"DGEList")) {
- x <- as.matrix(object$counts)
- lib.size <- object$samples$lib.size
- } else {
- x <- as.matrix(object)
- lib.size <- colSums(x)
- }
-
-# Check method
+ x <- as.matrix(object$counts)
+ lib.size <- object$samples$lib.size
+ if(any(is.na(x))) stop("NAs not permitted")
+
+ f <- NextMethod(object=x, lib.size=lib.size, method=method, refColumn=refColumn, logratioTrim=logratioTrim, sumTrim=sumTrim, doWeighting=doWeighting, Acutoff=Acutoff, p=p)
+
+# Output
+ object$samples$norm.factors <- f
+ return(object)
+
+}
+
+calcNormFactors.default <- function(object, lib.size=NULL, method=c("TMM","RLE","upperquartile","none"), refColumn=NULL, logratioTrim=.3, sumTrim=0.05, doWeighting=TRUE, Acutoff=-1e10, p=0.75, ...)
+# Scale normalization of RNA-Seq data.
+# Mark Robinson. Edits by Gordon Smyth.
+# Created October 22 October 2009. Last modified 2 Oct 2014.
+{
+# Check object
+ x <- as.matrix(object)
+ if(any(is.na(x))) stop("NAs not permitted")
+
+# Check lib.size
+ if(is.null(lib.size)) lib.size <- colSums(x)
+
+# Check method
method <- match.arg(method)
# Remove all zero rows
@@ -42,12 +61,7 @@ calcNormFactors <- function(object, method=c("TMM","RLE","upperquartile","none")
f <- f/exp(mean(log(f)))
# Output
- if(is(object, "DGEList")) {
- object$samples$norm.factors <- f
- return(object)
- } else {
- return(f)
- }
+ f
}
diff --git a/R/diffSpliceDGE.R b/R/diffSpliceDGE.R
new file mode 100644
index 0000000..4d36e66
--- /dev/null
+++ b/R/diffSpliceDGE.R
@@ -0,0 +1,253 @@
+diffSpliceDGE <- function(fit.exon, coef=ncol(fit.exon$design), geneid, exonid=NULL, verbose=TRUE)
+{
+# Identify exons and genes with splice variants using negative binomial GLMs
+# Yunshun Chen and Gordon Smyth
+# Created 29 March 2014. Last modified 25 Aug 2014.
+
+# Check input (from diffSplice in limma)
+ exon.genes <- fit.exon$genes
+ nexons <- nrow(fit.exon)
+ design <- fit.exon$design
+
+ if(is.null(exon.genes)) exon.genes <- data.frame(ExonID=1:nrow(fit.exon))
+ if(length(geneid)==1) {
+ genecolname <- as.character(geneid)
+ geneid <- exon.genes[[genecolname]]
+ } else {
+ exon.genes$GeneID <- geneid
+ genecolname <- "GeneID"
+ }
+ if(!is.null(exonid))
+ if(length(exonid)==1) {
+ exoncolname <- as.character(exonid)
+ exonid <- exon.genes[[exoncolname]]
+ } else {
+ exon.genes$ExonID <- exonid
+ exoncolname <- "ExonID"
+ }
+ else
+ exoncolname <- NULL
+
+# Sort by geneid (from diffSplice in limma)
+ if(is.null(exonid))
+ o <- order(geneid)
+ else
+ o <- order(geneid,exonid)
+ geneid <- geneid[o]
+ exon.genes <- exon.genes[o,,drop=FALSE]
+
+ fit.exon <- fit.exon[o, ]
+
+# Gene level information
+ gene.counts <- rowsum(fit.exon$counts, geneid, reorder=FALSE)
+ gene.dge <- DGEList(counts=gene.counts, genes=unique(geneid))
+ gene.dge <- estimateDisp(gene.dge, design, robust=FALSE)
+ fit.gene <- glmFit(gene.dge, design)
+
+# Count exons and get genewise variances
+ gene.nexons <- rowsum(rep(1,nexons), geneid, reorder=FALSE)
+ if(verbose) {
+ cat("Total number of exons: ", nexons, "\n")
+ cat("Total number of genes: ", length(gene.nexons), "\n")
+ cat("Number of genes with 1 exon: ", sum(gene.nexons==1), "\n")
+ cat("Mean number of exons in a gene: ", round(mean(gene.nexons),0), "\n")
+ cat("Max number of exons in a gene: ", max(gene.nexons), "\n")
+ }
+
+# Squeeze
+ fit.gene.trend <- glmFit(gene.dge, design=design, dispersion=gene.dge$trended.dispersion)
+ zerofit <- (fit.gene.trend$fitted.values < 1e-4) & (fit.gene.trend$counts < 1e-4)
+ gene.df.residual <- .residDF(zerofit, design)
+ s2 <- fit.gene.trend$deviance / gene.df.residual
+ s2[gene.df.residual==0] <- 0
+ s2 <- pmax(s2,0)
+ s2.fit <- squeezeVar(s2, df=gene.df.residual, covariate=fit.gene.trend$AveLogCPM, robust=FALSE)
+
+# Remove genes with only 1 exon
+ gene.keep <- gene.nexons > 1
+ ngenes <- sum(gene.keep)
+ if(ngenes==0) stop("No genes with more than one exon")
+
+ exon.keep <- rep(gene.keep, gene.nexons)
+ geneid <- geneid[exon.keep]
+ exon.genes <- exon.genes[exon.keep, , drop=FALSE]
+ fit.exon <- fit.exon[exon.keep, ]
+
+ fit.gene <- fit.gene[gene.keep, ]
+ gene.nexons <- gene.nexons[gene.keep]
+ gene.df.test <- gene.nexons-1
+ gene.df.residual <- gene.df.residual[gene.keep]
+
+# Genewise betas
+ g <- rep(1:ngenes, times=gene.nexons)
+ gene.counts.exon <- fit.gene$counts[g, , drop=FALSE]
+ gene.dispersion.exon <- fit.gene$dispersion[g]
+ gene.fit.exon <- glmFit(gene.counts.exon, design=design, dispersion=gene.dispersion.exon, lib.size=gene.dge$samples$lib.size)
+ gene.betabar <- gene.fit.exon$coefficients[, coef, drop=FALSE]
+ offset.new <- fit.exon$offset + gene.betabar %*% t(design[, coef, drop=FALSE])
+ coefficients <- fit.exon$coefficients[, coef, drop=FALSE] - gene.betabar
+
+# Testing
+ design0 <- design[, -coef, drop=FALSE]
+ fit.null <- glmFit(fit.exon$counts, design=design0, offset=offset.new, dispersion=fit.exon$dispersion)
+ fit.alt <- glmFit(fit.exon$counts, design=design, offset=offset.new, dispersion=fit.exon$dispersion)
+
+# Exon p-values
+ exon.LR <- fit.null$deviance - fit.alt$deviance
+ exon.df.test <- fit.null$df.residual - fit.alt$df.residual
+ exon.F <- exon.LR / exon.df.test / s2.fit$var.post[gene.keep][g]
+ gene.df.total <- s2.fit$df.prior + gene.df.residual
+ max.df.residual <- ncol(fit.exon$counts)-ncol(design)
+ gene.df.total <- pmin(gene.df.total, ngenes*max.df.residual)
+ exon.p.value <- pf(exon.F, df1=exon.df.test, df2=gene.df.total[g], lower.tail=FALSE, log.p=FALSE)
+
+ #Ensure is not more significant than chisquare test
+ i <- s2.fit$var.post[gene.keep][g] < 1
+ if(any(i)) {
+ chisq.pvalue <- pchisq(exon.LR[i], df=exon.df.test[i], lower.tail=FALSE, log.p=FALSE)
+ exon.p.value[i] <- pmax(exon.p.value[i], chisq.pvalue)
+ }
+
+# Gene p-values
+
+# Gene Simes' p-values
+ o <- order(g, exon.p.value, decreasing=FALSE)
+ p <- exon.p.value[o]
+ q <- rep(1, sum(gene.nexons))
+ r <- cumsum(q) - rep(cumsum(q)[cumsum(gene.nexons)]-gene.nexons, gene.nexons)
+ pp <- p*rep(gene.nexons, gene.nexons)/r
+ #pp <- p*rep(gene.nexons-1, gene.nexons)/pmax(r-1, 1)
+ oo <- order(-g, pmin(pp,1), decreasing=TRUE)
+ gene.Simes.p.value <- pp[oo][cumsum(gene.nexons)]
+
+# Gene F p-values
+ gene.F <- rowsum(exon.F, geneid, reorder=FALSE) / (gene.df.test)
+ gene.F.p.value <- pf(gene.F, df1=(gene.df.test), df2=gene.df.total, lower.tail=FALSE)
+
+# Output
+ out <- new("DGELRT",list())
+ out$comparison <- colnames(design)[coef]
+ out$design <- design
+ out$coefficients <- as.vector(coefficients)
+
+# Exon level output
+ out$exon.df.test <- exon.df.test
+ out$exon.df.prior <- s2.fit$df.prior[g]
+ out$exon.df.residual <- gene.df.residual[g]
+ out$exon.F <- exon.F
+ out$exon.p.value <- exon.p.value
+ out$genes <- exon.genes
+ out$genecolname <- genecolname
+ out$exoncolname <- exoncolname
+
+# Gene level output
+ out$gene.df.test <- gene.df.test
+ out$gene.df.prior <- s2.fit$df.prior
+ out$gene.df.residual <- gene.df.residual
+ out$gene.Simes.p.value <- gene.Simes.p.value
+ out$gene.F <- gene.F
+ out$gene.F.p.value <- gene.F.p.value
+
+# Which columns of exon.genes contain gene level annotation? (from diffSplice in limma)
+ exon.lastexon <- cumsum(gene.nexons)
+ exon.firstexon <- exon.lastexon-gene.nexons+1
+ no <- logical(nrow(exon.genes))
+ isdup <- vapply(exon.genes,duplicated,no)[-exon.firstexon,,drop=FALSE]
+ isgenelevel <- apply(isdup,2,all)
+ out$gene.genes <- exon.genes[exon.lastexon,isgenelevel, drop=FALSE]
+ out$gene.genes$NExons <- gene.nexons
+
+ out
+}
+
+
+topSpliceDGE <- function(lrt, level="gene", gene.test="Simes", number=10, FDR=1)
+# Yunshun Chen and Gordon Smyth
+# Created 29 March 2014. Last modified 24 September 2014.
+{
+ level <- match.arg(level,c("exon","gene"))
+ gene.test <- match.arg(gene.test,c("Simes","F","f"))
+ if(level=="exon") {
+ number <- min(number, nrow(lrt$genes))
+ P <- lrt$exon.p.value
+ BH <- p.adjust(P, method="BH")
+ if(FDR<1) number <- min(number, sum(BH<FDR))
+ o <- order(P)[1:number]
+ data.frame(lrt$genes[o,,drop=FALSE],logFC=lrt$coefficients[o],F=lrt$exon.F[o],P.Value=P[o],FDR=BH[o])
+ } else {
+ number <- min(number, nrow(lrt$gene.genes))
+ if(gene.test == "Simes") P <- lrt$gene.Simes.p.value else P <- lrt$gene.F.p.value
+ BH <- p.adjust(P, method="BH")
+ if(FDR<1) number <- min(number,sum(BH<FDR))
+ o <- order(P)[1:number]
+ if(gene.test=="Simes")
+ data.frame(lrt$gene.genes[o,,drop=FALSE],P.Value=P[o],FDR=BH[o])
+ else
+ data.frame(lrt$gene.genes[o,,drop=FALSE],F=lrt$gene.F[o],P.Value=P[o],FDR=BH[o])
+ }
+}
+
+
+plotSpliceDGE <- function(lrt, geneid=NULL, rank=1L, FDR = 0.05)
+# Plot exons of most differentially spliced gene
+# Yunshun Chen and Gordon Smyth
+# Created 29 March 2014. Last modified 24 September 2014.
+{
+ # Gene labelling including gene symbol
+ genecolname <- lrt$genecolname
+ genelab <- grep(paste0(genecolname,"|Symbol|symbol"), colnames(lrt$gene.genes), value = T)
+
+ if(is.null(geneid)) {
+ if(rank==1L)
+ i <- which.min(lrt$gene.Simes.p.value)
+ else
+ i <- order(lrt$gene.Simes.p.value)[rank]
+ geneid <- paste(lrt$gene.genes[i,genelab], collapse = ".")
+ } else {
+ i <- which(lrt$gene.genes[,lrt$genecolname]==geneid)
+ geneid <- paste(lrt$gene.genes[i,genelab], collapse = ".")
+ if(!length(i)) stop(paste("geneid",geneid,"not found"))
+ }
+ exon.lastexon <- cumsum(lrt$gene.genes$NExons[1:i])
+ j <- (exon.lastexon[i]-lrt$gene.genes$NExons[i]+1):exon.lastexon[i]
+ exoncolname <- lrt$exoncolname
+ if(is.null(exoncolname)){
+ plot(lrt$coefficients[j], xlab = "Exon", ylab = "logFC (this exon vs the average)", main = geneid, type = "b")
+ }
+ # Plot exons and mark exon ids on the axis
+ if(!is.null(exoncolname)) {
+ exon.id <- lrt$genes[j, exoncolname]
+ xlab <- paste("Exon", exoncolname, sep = " ")
+
+ plot(lrt$coefficients[j], xlab = "", ylab = "logFC (this exon vs the average)", main = geneid,type = "b", xaxt = "n")
+ axis(1, at = 1:length(j), labels = exon.id, las = 2, cex.axis = 0.6)
+ mtext(xlab, side = 1, padj = 5.2)
+
+ # Mark the topSpliced exons
+ top <- topSpliceDGE(lrt, number = Inf, level = "exon", FDR = FDR)
+ m <- which(top[,genecolname] %in% lrt$gene.genes[i,genecolname])
+
+ if(length(m) > 0){
+ if(length(m) == 1) cex <- 1.5 else{
+ abs.fdr <- abs(log10(top$FDR[m]))
+ from <- range(abs.fdr)
+ to <- c(1,2)
+ cex <- (abs.fdr - from[1])/diff(from) * diff(to) + to[1]
+ }
+ mark <- match(top[m, exoncolname], exon.id)
+ points((1:length(j))[mark], lrt$coefficients[j[mark]], col = "red", pch = 16, cex = cex)
+ }
+ }
+ abline(h=0,lty=2)
+}
+
+
+
+
+
+
+
+
+
+
+
diff --git a/R/equalizeLibSizes.R b/R/equalizeLibSizes.R
index 90dd0ba..ff123b6 100644
--- a/R/equalizeLibSizes.R
+++ b/R/equalizeLibSizes.R
@@ -1,8 +1,8 @@
-equalizeLibSizes <- function(object, dispersion=0, common.lib.size=NULL)
+equalizeLibSizes <- function(object, dispersion=NULL, common.lib.size=NULL)
# Normalize counts so that the library sizes can be treated as equal.
# Uses a quantile-to-quantile transformation so that new count counts are equivalent deviates on the equalized scale.
# Davis McCarthy, Gordon Smyth.
-# Created July 2009. Last modified 25 July 2012.
+# Created July 2009. Last modified 24 September 2014.
{
d <- dim(object)
ntags <- d[1]
@@ -10,7 +10,9 @@ equalizeLibSizes <- function(object, dispersion=0, common.lib.size=NULL)
lib.size <- object$samples$lib.size * object$samples$norm.factors
if(is.null(common.lib.size)) common.lib.size <- exp(mean(log(lib.size)))
levs.group <- unique(object$samples$group)
- if(length(dispersion)==1) dispersion <- rep(dispersion,ntags)
+
+ if(is.null(dispersion)) dispersion <- getDispersion(object)
+ if(is.null(dispersion)) dispersion <- 0.05
input.mean <- output.mean <- matrix(0,ntags,nlibs)
for(i in 1:length(levs.group)) {
@@ -18,7 +20,7 @@ equalizeLibSizes <- function(object, dispersion=0, common.lib.size=NULL)
beta <- mglmOneGroup(object$counts[,j,drop=FALSE],dispersion=dispersion,offset=log(lib.size[j]))
lambda <- exp(beta)
input.mean[,j] <- matrix(lambda,ncol=1) %*% matrix(lib.size[j],nrow=1)
- output.mean[,j] <- matrix(lambda, ncol=1) %*% matrix(common.lib.size, nrow=1, ncol=sum(j));
+ output.mean[,j] <- matrix(lambda, ncol=1) %*% matrix(common.lib.size, nrow=1, ncol=sum(j))
}
pseudo <- q2qnbinom(object$counts, input.mean=input.mean, output.mean=output.mean, dispersion=dispersion)
pseudo[pseudo<0] <- 0
diff --git a/R/estimateDisp.R b/R/estimateDisp.R
index 92e688b..0674f17 100644
--- a/R/estimateDisp.R
+++ b/R/estimateDisp.R
@@ -2,29 +2,31 @@
########### Weighted Likelihood Empirical Bayes ##############
##############################################################
-estimateDisp <- function(y, design=NULL, prior.df=NULL, trend.method="locfit", span=NULL, grid.length=21, grid.range=c(-10,10), robust=FALSE, winsor.tail.p=c(0.05,0.1), tol=1e-06)
+estimateDisp <- function(y, design=NULL, prior.df=NULL, trend.method="locfit", span=NULL, min.row.sum=5, grid.length=21, grid.range=c(-10,10), robust=FALSE, winsor.tail.p=c(0.05,0.1), tol=1e-06)
# Estimating dispersion using weighted conditional likelihood empirical Bayes.
# Use GLM approach if a design matrix is given, and classic approach otherwise.
# It calculates a matrix of likelihoods for each gene at a set of dispersion grid points, and then calls WLEB() to do the shrinkage.
-# Yunshun Chen, Gordon Smyth. Created July 2012. Last modified 17 Feb 2014.
+# Yunshun Chen, Gordon Smyth. Created July 2012. Last modified 11 Sep 2014.
{
if( !is(y,"DGEList") ) stop("y must be a DGEList")
group <- y$samples$group <- as.factor(y$samples$group)
- trend <- match.arg(trend.method, c("none", "loess", "locfit", "movingave"))
+ trend.method <- match.arg(trend.method, c("none", "loess", "locfit", "movingave"))
ntags <- nrow(y$counts)
nlibs <- ncol(y$counts)
+
+ offset <- getOffset(y)
+ AveLogCPM <- aveLogCPM(y)
+ offset <- expandAsMatrix(offset, dim(y))
+ # Check for genes with small counts
+ sel <- rowSums(y$counts) >= min.row.sum
+
# Spline points
spline.pts <- seq(from=grid.range[1],to=grid.range[2],length=grid.length)
spline.disp <- 0.1 * 2^spline.pts
grid.vals <- spline.disp/(1+spline.disp)
- l0 <- matrix(0, ntags, grid.length)
-
- offset <- getOffset(y)
- AveLogCPM <- aveLogCPM(y)
- offset <- expandAsMatrix(offset, dim(y))
-
+ l0 <- matrix(0, sum(sel), grid.length)
# Classic edgeR
if(is.null(design)){
@@ -38,23 +40,23 @@ estimateDisp <- function(y, design=NULL, prior.df=NULL, trend.method="locfit", s
y$common.dispersion <- NA
return(y)
}
- pseudo.obj <- y
+ pseudo.obj <- y[sel, ]
- q2q.out <- equalizeLibSizes(y, dispersion=0.01)
+ q2q.out <- equalizeLibSizes(y[sel, ], dispersion=0.01)
pseudo.obj$counts <- q2q.out$pseudo
ysplit <- splitIntoGroups(pseudo.obj)
delta <- optimize(commonCondLogLikDerDelta, interval=c(1e-4,100/(100+1)), tol=tol, maximum=TRUE, y=ysplit, der=0)
delta <- delta$maximum
disp <- delta/(1-delta)
- q2q.out <- equalizeLibSizes(y,dispersion=disp)
+ q2q.out <- equalizeLibSizes(y[sel, ], dispersion=disp)
pseudo.obj$counts <- q2q.out$pseudo
ysplit <- splitIntoGroups(pseudo.obj)
for(j in 1:grid.length) for(i in 1:length(ysplit))
l0[,j] <- condLogLikDerDelta(ysplit[[i]], grid.vals[j], der=0) + l0[,j]
}
- # GLM edgeR
+ # GLM edgeR (using the last fit to hot-start the next fit).
else {
design <- as.matrix(design)
if(ncol(design) >= ncol(y$counts)) {
@@ -62,22 +64,27 @@ estimateDisp <- function(y, design=NULL, prior.df=NULL, trend.method="locfit", s
y$common.dispersion <- NA
return(y)
}
- for(i in 1:grid.length)
- l0[,i] <- adjustedProfileLik(spline.disp[i], y=y$counts, design=design, offset=offset)
-
+ last.beta <- NULL
+ for(i in 1:grid.length) {
+ out <- adjustedProfileLik(spline.disp[i], y=y$counts[sel, ], design=design, offset=offset[sel,], start=last.beta, get.coef=TRUE)
+ l0[,i] <- out$apl
+ last.beta <- out$beta
+ }
}
- out.1 <- WLEB(theta=spline.pts, loglik=l0, covariate=AveLogCPM, trend.method=trend.method, span=span, individual=FALSE, m0.out=TRUE)
+ out.1 <- WLEB(theta=spline.pts, loglik=l0, covariate=AveLogCPM[sel], trend.method=trend.method, span=span, individual=FALSE, m0.out=TRUE)
y$common.dispersion <- 0.1 * 2^out.1$overall
- y$trended.dispersion <- 0.1 * 2^out.1$trend
+ disp.trend <- 0.1 * 2^out.1$trend
+ y$trended.dispersion <- rep( disp.trend[which.min(AveLogCPM[sel])], ntags )
+ y$trended.dispersion[sel] <- disp.trend
y$trend.method <- trend.method
y$AveLogCPM <- AveLogCPM
y$span <- out.1$span
# Calculate prior.df
if(is.null(prior.df)){
- glmfit <- glmFit(y$counts, design, offset=offset, dispersion=y$trended.dispersion, prior.count=0)
+ glmfit <- glmFit(y$counts[sel,], design, offset=offset[sel,], dispersion=disp.trend, prior.count=0)
# Residual deviances
df.residual <- glmfit$df.residual
@@ -90,44 +97,48 @@ estimateDisp <- function(y, design=NULL, prior.df=NULL, trend.method="locfit", s
s2 <- glmfit$deviance / df.residual
s2[df.residual==0] <- 0
s2 <- pmax(s2,0)
- s2.fit <- squeezeVar(s2, df=df.residual, covariate=AveLogCPM, robust=robust, winsor.tail.p=winsor.tail.p)
+ s2.fit <- squeezeVar(s2, df=df.residual, covariate=AveLogCPM[sel], robust=robust, winsor.tail.p=winsor.tail.p)
prior.df <- s2.fit$df.prior
}
ncoefs <- ncol(design)
prior.n <- prior.df/(nlibs-ncoefs)
-
- # Protecting against infinite prior.n's; otherwise, interpolation of a matrix of Inf values will give the smallest value.
- if(!robust){
- # scalar prior.n
- if (prior.n > 1e6) {
- if (trend.method!='none') {
- y$tagwise.dispersion <- y$trended.dispersion
- } else {
- y$tagwise.dispersion <- rep(y$common.dispersion, ntags)
- }
- } else {
- out.2 <- WLEB(theta=spline.pts, loglik=l0, prior.n=prior.n, covariate=AveLogCPM,
- trend.method=trend.method, span=span, overall=FALSE, trend=FALSE, m0=out.1$shared.loglik)
- y$tagwise.dispersion <- 0.1 * 2^out.2$individual
- }
+ if (trend.method!='none') {
+ y$tagwise.dispersion <- y$trended.dispersion
} else {
- # vector prior.n
- i <- prior.n > 1e6
y$tagwise.dispersion <- rep(y$common.dispersion, ntags)
- if (trend.method!='none') {
- y$tagwise.dispersion[i] <- y$trended.dispersion[i]
- }
- if(sum(!i)!=0){
- # Make sure that there are still some genes with finite prior.df
- out.2 <- WLEB(theta=spline.pts, loglik=l0[!i,,drop=FALSE], prior.n=prior.n[!i], covariate=AveLogCPM[!i],
- trend.method=trend.method, span=span, overall=FALSE, trend=FALSE, m0=out.1$shared.loglik[!i,,drop=FALSE])
- y$tagwise.dispersion[!i] <- 0.1 * 2^out.2$individual
+ }
+
+ # Checking if the shrinkage is near-infinite.
+ too.large <- prior.n > 1e6
+ if (!all(too.large)) {
+ temp.n <- prior.n
+ if (any(too.large)) {
+ temp.n[too.large] <- 1e6
+ }
+
+ # Estimating tagwise dispersions
+ out.2 <- WLEB(theta=spline.pts, loglik=l0, prior.n=temp.n, covariate=AveLogCPM[sel],
+ trend.method=trend.method, span=span, overall=FALSE, trend=FALSE, m0=out.1$shared.loglik)
+
+ if (!robust) {
+ y$tagwise.dispersion[sel] <- 0.1 * 2^out.2$individual
+ } else {
+ y$tagwise.dispersion[sel][!too.large] <- 0.1 * 2^out.2$individual[!too.large]
}
}
- y$prior.df <- prior.df
- y$prior.n <- prior.n
+
+ if(!robust){
+ # scalar prior.n
+ y$prior.df <- prior.df
+ y$prior.n <- prior.n
+ } else {
+ # vector prior.n
+ y$prior.df <- y$prior.n <- rep(Inf, ntags)
+ y$prior.df[sel] <- prior.df
+ y$prior.n[sel] <- prior.n
+ }
y
}
@@ -180,7 +191,7 @@ WLEB <- function(theta, loglik, prior.n=5, covariate=NULL, trend.method="locfit"
# weighted empirical Bayes posterior estimates
if(individual){
- prior.n <- expandAsMatrix(as.vector(prior.n), dim(m0))
+ stopifnot(all(is.finite(prior.n)))
l0a <- loglik + prior.n*m0
out$individual <- maximizeInterpolant(theta, l0a)
}
diff --git a/R/exactTestByDeviance.R b/R/exactTestByDeviance.R
index 688564e..c4a468e 100644
--- a/R/exactTestByDeviance.R
+++ b/R/exactTestByDeviance.R
@@ -38,7 +38,7 @@ exactTestByDeviance <- function(y1,y2,dispersion=0)
# Checking the dispersion type.
dispersion<-as.double(dispersion)
- pvals<-.Call("R_exact_test_by_deviance", sum1, sum2, n1, n2, dispersion, PACKAGE="edgeR")
+ pvals<-.Call(.cR_exact_test_by_deviance, sum1, sum2, n1, n2, dispersion)
if (is.character(pvals)) { stop(pvals) }
pmin(pvals, 1)
}
diff --git a/R/glmQLFTest.R b/R/glmQLFTest.R
index dc5d851..dbff67f 100644
--- a/R/glmQLFTest.R
+++ b/R/glmQLFTest.R
@@ -1,29 +1,28 @@
-glmQLFTest <- function(y, design=NULL, dispersion=NULL, coef=ncol(glmfit$design), contrast=NULL, abundance.trend=TRUE, robust=FALSE, winsor.tail.p=c(0.05,0.1), plot=FALSE)
-# Quasi-likelihood F-tests for DGE glms.
-# Davis McCarthy and Gordon Smyth.
-# Created 18 Feb 2011. Last modified 18 Jan 2014.
+glmQLFit <- function(y, design=NULL, dispersion=NULL, abundance.trend=TRUE, robust=FALSE, winsor.tail.p=c(0.05, 0.1), ...)
+# Fits a GLM and computes quasi-likelihood dispersions for each gene.
+# Written by Aaron Lun, based on code by Davis McCarthy and Gordon Smyth
+# Created 15 September 2014. Last modified 17 September 2014.
{
# Initial fit with trended dispersion
- if(is(y,"DGEList")) {
- if(is.null(dispersion)) {
- dispersion <- y$trended.dispersion
- if(is.null(dispersion)) dispersion <- y$common.dispersion
- if(is.null(dispersion)) dispersion <- 0.05
+ if(!is(y,"DGEList")) { stop("y must be a DGEList") }
+ if(is.null(dispersion)) {
+ dispersion <- y$trended.dispersion
+ if(is.null(dispersion)) dispersion <- y$common.dispersion
+ if(is.null(dispersion)) dispersion <- 0.05
+ }
+ glmfit <- glmFit(y,design=design,dispersion=dispersion, ...)
+
+# Setting up the abundances.
+ A <- NULL
+ if(abundance.trend) {
+ if(is.null(y$AveLogCPM)) {
+ A <- aveLogCPM(y) # For consistency with call from estimateDisp.
+ } else {
+ A <- y$AveLogCPM
}
- glmfit <- glmFit(y,design=design,dispersion=dispersion)
- } else {
- glmfit <- y
- disptype <- attr(glmfit$dispersion,"type")
- if(!is.null(disptype)) if(disptype=="tagwise") stop("glmfit should be computed using trended dispersions, not tagwise")
+ glmfit$AveLogCPM <- A
}
-# Call glmLRT to get most of the results that we need for the QL F-test calculations
- out <- glmLRT(glmfit, coef=coef, contrast=contrast)
- if(is.null(out$AveLogCPM)) out$AveLogCPM <- aveLogCPM(glmfit$counts)
-
-# Residual deviances
- df.residual <- glmfit$df.residual
-
# Adjust df.residual for fitted values at zero
zerofit <- (glmfit$fitted.values < 1e-4) & (glmfit$counts < 1e-4)
df.residual <- .residDF(zerofit, design)
@@ -32,34 +31,35 @@ glmQLFTest <- function(y, design=NULL, dispersion=NULL, coef=ncol(glmfit$design)
s2 <- glmfit$deviance / df.residual
s2[df.residual==0] <- 0
s2 <- pmax(s2,0)
- if(abundance.trend) {
- A <- out$AveLogCPM
- } else {
- A <- NULL
- }
s2.fit <- squeezeVar(s2,df=df.residual,covariate=A,robust=robust,winsor.tail.p=winsor.tail.p)
-# Plot
- if(plot) {
- if(!abundance.trend) A <- out$AveLogCPM
- plot(A,sqrt(sqrt(s2)),xlab="Average Log2 CPM",ylab="Quarter-Root Mean Deviance",pch=16,cex=0.2)
- o <- order(A)
- points(A[o],sqrt(sqrt(s2.fit$var.post[o])),pch=16,cex=0.2,col="red")
- lines(A[o],sqrt(sqrt(s2.fit$var.prior[o])),col="blue")
- legend("topright",pch=16,col=c("black","red"),legend=c("Raw","Squeezed"))
- }
+# Storing results
+ glmfit$df.residual <- df.residual
+ glmfit$s2.fit <- s2.fit
+ glmfit$df.prior <- s2.fit$df.prior
+ glmfit
+}
+
+glmQLFTest <- function(glmfit, coef=ncol(glmfit$design), contrast=NULL)
+# Quasi-likelihood F-tests for DGE glms.
+# Davis McCarthy and Gordon Smyth.
+# Created 18 Feb 2011. Last modified 15 Sep 2014.
+{
+ if(!is(glmfit,"DGEGLM")) stop("glmfit must be an DGEGLM object produced by glmQLFit")
+ if(is.null(glmfit$s2.fit)) stop("need to run glmQLFit before glmQLFTest")
+ out <- glmLRT(glmfit, coef=coef, contrast=contrast)
# Compute the QL F-statistic
- F <- out$table$LR / out$df.test / s2.fit$var.post
- df.total <- s2.fit$df.prior+df.residual
+ F <- out$table$LR / out$df.test / glmfit$s2.fit$var.post
+ df.total <- glmfit$s2.fit$df.prior + glmfit$df.residual
max.df.residual <- ncol(glmfit$counts)-ncol(glmfit$design)
- df.total <- pmin(df.total, length(s2)*max.df.residual)
+ df.total <- pmin(df.total, nrow(glmfit)*max.df.residual)
# Compute p-values from the QL F-statistic
F.pvalue <- pf(F, df1=out$df.test, df2=df.total, lower.tail=FALSE, log.p=FALSE)
# Ensure is not more significant than chisquare test
- i <- s2.fit$var.post < 1
+ i <- glmfit$s2.fit$var.post < 1
if(any(i)) {
chisq.pvalue <- pchisq(out$table$LR[i], df=out$df.test[i], lower.tail=FALSE, log.p=FALSE)
F.pvalue[i] <- pmax(F.pvalue[i],chisq.pvalue)
@@ -68,11 +68,32 @@ glmQLFTest <- function(y, design=NULL, dispersion=NULL, coef=ncol(glmfit$design)
out$table$LR <- out$table$PValue <- NULL
out$table$F <- F
out$table$PValue <- F.pvalue
-
- out$df.residual <- df.residual
- out$s2.fit <- s2.fit
- out$df.prior <- s2.fit$df.prior
out$df.total <- df.total
+
out
}
+plotQLDisp <- function(glmfit, xlab="Average Log2 CPM", ylab="Quarter-Root Mean Deviance", pch=16, cex=0.2,
+ col.shrunk="red", col.trend="blue", col.raw="black", ...)
+# Plots the result of QL-based shrinkage.
+# written by Aaron Lun, based on code by Davis McCarthy and Gordon Smyth
+# 15 September 2014
+{
+ A <- glmfit$AveLogCPM
+ if(is.null(A)) A <- aveLogCPM(glmfit)
+ s2 <- glmfit$deviance / glmfit$df.residual
+ if(is.null(glmfit$s2.fit)) { stop("need to run glmQLFit before plotQLDisp") }
+
+ plot(A, sqrt(sqrt(s2)),xlab=xlab, ylab=ylab, pch=pch, cex=cex, col=col.raw, ...)
+ points(A,sqrt(sqrt(glmfit$s2.fit$var.post)),pch=16,cex=0.2,col=col.shrunk)
+ if (length(glmfit$s2.fit$var.prior)==1L) {
+ abline(h=sqrt(sqrt(glmfit$s2.fit$var.prior)), col=col.trend)
+ } else {
+ o <- order(A)
+ lines(A[o],sqrt(sqrt(glmfit$s2.fit$var.prior[o])),col=col.trend)
+ }
+
+ legend("topright",pch=16,col=c(col.raw,col.shrunk,col.trend),legend=c("Raw","Squeezed", "Trend"))
+ invisible(NULL)
+}
+
diff --git a/R/glmfit.R b/R/glmfit.R
index 741254f..cfe1028 100644
--- a/R/glmfit.R
+++ b/R/glmfit.R
@@ -23,7 +23,7 @@ glmFit.default <- function(y, design=NULL, dispersion=NULL, offset=NULL, lib.siz
# Fit negative binomial generalized linear model for each transcript
# to a series of digital expression libraries
# Davis McCarthy and Gordon Smyth
-# Created 17 August 2010. Last modified 26 Nov 2013.
+# Created 17 August 2010. Last modified 11 Sep 2014.
{
# Check y
y <- as.matrix(y)
@@ -57,7 +57,7 @@ glmFit.default <- function(y, design=NULL, dispersion=NULL, offset=NULL, lib.siz
# If the design is equivalent to a oneway layout, use a shortcut algorithm
group <- designAsFactor(design)
if(nlevels(group)==ncol(design)) {
- fit <- mglmOneWay(y,design=design,dispersion=dispersion,offset=offset,weights=weights)
+ fit <- mglmOneWay(y,design=design,dispersion=dispersion,offset=offset,weights=weights,coef.start=start)
fit$deviance <- nbinomDeviance(y=y,mean=fit$fitted.values,dispersion=dispersion,weights=weights)
fit$method <- "oneway"
} else {
@@ -67,7 +67,12 @@ glmFit.default <- function(y, design=NULL, dispersion=NULL, offset=NULL, lib.siz
# Prepare output
fit$counts <- y
- if(prior.count>0) fit$coefficients <- predFC(y,design,offset=offset,dispersion=dispersion,prior.count=prior.count,weights=weights,...)*log(2)
+ if(prior.count>0) {
+ fit$unshrunk.coefficients <- fit$coefficients
+ colnames(fit$unshrunk.coefficients) <- colnames(design)
+ rownames(fit$unshrunk.coefficients) <- rownames(y)
+ fit$coefficients <- predFC(y,design,offset=offset,dispersion=dispersion,prior.count=prior.count,weights=weights,...)*log(2)
+ }
colnames(fit$coefficients) <- colnames(design)
rownames(fit$coefficients) <- rownames(y)
dimnames(fit$fitted.values) <- dimnames(y)
@@ -77,6 +82,7 @@ glmFit.default <- function(y, design=NULL, dispersion=NULL, offset=NULL, lib.siz
fit$offset <- offset
fit$dispersion <- dispersion
fit$weights <- weights
+ fit$prior.count <- prior.count
new("DGEGLM",fit)
}
diff --git a/R/goana.R b/R/goana.R
new file mode 100644
index 0000000..0b44802
--- /dev/null
+++ b/R/goana.R
@@ -0,0 +1,66 @@
+goana.DGELRT <- function(de, geneid = rownames(de), FDR = 0.05, species = "Hs", trend = FALSE, ...)
+# Gene ontology analysis of DE genes from linear model fit
+# Gordon Smyth and Yifang Hu
+# Created 25 August 2014. Last modified 26 August 2014.
+{
+ # Check fit
+ if(is.null(de$table$PValue)) stop("p values not found in fit object")
+ ngenes <- nrow(de)
+
+ # Check geneid
+ # Can be either a vector of IDs or a column name
+ geneid <- as.character(geneid)
+ if(length(geneid) == ngenes) {
+ universe <- geneid
+ } else
+ if(length(geneid) == 1L) {
+ universe <- de$genes[[geneid]]
+ if(is.null(universe)) stop(paste("Column",geneid,"not found in de$genes"))
+ } else
+ stop("geneid has incorrect length")
+
+ # Check trend
+ # Can be logical, or a numeric vector of covariate values, or the name of the column containing the covariate values
+ if(is.logical(trend)) {
+ if(trend) {
+ covariate <- de$table$logCPM
+ if(is.null(covariate)) stop("logCPM not found in fit object")
+ }
+ } else
+ if(is.numeric(trend)) {
+ if(length(trend) != ngenes) stop("If trend is numeric, then length must equal nrow(de)")
+ covariate <- trend
+ trend <- TRUE
+ } else {
+ if(is.character(trend)) {
+ if(length(trend) != 1L) stop("If trend is character, then length must be 1")
+ covariate <- de$genes[[trend]]
+ if(is.null(covariate)) stop(paste("Column",trend,"not found in de$genes"))
+ trend <- TRUE
+ } else
+ stop("trend is neither logical, numeric nor character")
+ }
+
+ # Check FDR
+ if(!is.numeric(FDR) | length(FDR) != 1) stop("FDR must be numeric and of length 1.")
+ if(FDR < 0 | FDR > 1) stop("FDR should be between 0 and 1.")
+
+ # Get up and down DE genes
+ fdr.coef <- p.adjust(de$table$PValue, method = "BH")
+ EG.DE.UP <- universe[fdr.coef < FDR & de$table$logFC > 0]
+ EG.DE.DN <- universe[fdr.coef < FDR & de$table$logFC < 0]
+ de.gene <- list(Up=EG.DE.UP, Down=EG.DE.DN)
+
+ # Fit monotonic cubic spline for DE genes vs. gene.weights
+ if(trend) {
+ PW <- isDE <- rep(0,ngenes)
+ isDE[fdr.coef < FDR] <- 1
+ o <- order(covariate)
+ PW[o] <- tricubeMovingAverage(isDE[o],span=0.5,full.length=TRUE)
+ }
+ if(!trend) PW <- NULL
+
+ NextMethod(de = de.gene, universe = universe, species = species, prior.prob = PW, ...)
+}
+
+goana.DGEExact <- goana.DGELRT
diff --git a/R/goodTuring.R b/R/goodTuring.R
index 20252e8..733de10 100644
--- a/R/goodTuring.R
+++ b/R/goodTuring.R
@@ -34,7 +34,7 @@ goodTuring <- function(x, conf=1.96)
}
# SGT algorithm (no type checking, as that's enforced above)
- out <- .Call("R_simple_good_turing", r, n, conf, PACKAGE="edgeR")
+ out <- .Call(.cR_simple_good_turing, r, n, conf)
if (is.character(out)) { stop(out) }
names(out) <- c("P0","proportion")
diff --git a/R/loessByCol.R b/R/loessByCol.R
index 22e98dc..584b35e 100644
--- a/R/loessByCol.R
+++ b/R/loessByCol.R
@@ -26,7 +26,7 @@ loessByCol <- function(y, x=NULL, span=0.5)
# Passing to the compiled code. Note type checking, otherwise the code will complain.
if (!is.double(y)) storage.mode(y) <- "double"
if (!is.double(x)) x <- as.double(x)
- fitted <- .Call("R_loess_by_col", x, y, ncol(y), nspan, PACKAGE="edgeR")
+ fitted <- .Call(.cR_loess_by_col, x, y, ncol(y), nspan)
if (is.character(fitted)) { stop(fitted) }
# Recover the original order.
diff --git a/R/maximizeInterpolant.R b/R/maximizeInterpolant.R
index cefd5d1..e0b7157 100644
--- a/R/maximizeInterpolant.R
+++ b/R/maximizeInterpolant.R
@@ -19,7 +19,7 @@ maximizeInterpolant <- function( x, y )
# Performing some type checking.
if (!is.double(x)) storage.mode(x)<-"double"
if (!is.double(y)) storage.mode(y)<-"double"
- out<-.Call("R_maximize_interpolant", x, t(y), PACKAGE="edgeR")
+ out<-.Call(.cR_maximize_interpolant, x, t(y))
if (is.character(out)) { stop(out) }
return(out);
}
diff --git a/R/mglmLevenberg.R b/R/mglmLevenberg.R
index 9885a72..b39bfef 100644
--- a/R/mglmLevenberg.R
+++ b/R/mglmLevenberg.R
@@ -59,7 +59,7 @@ mglmLevenberg <- function(y, design, dispersion=0, offset=0, weights=NULL, coef.
if (!is.double(weights)) storage.mode(weights) <- "double"
if (!is.double(beta)) storage.mode(beta) <- "double"
if (!is.double(mu)) storage.mode(mu) <- "double"
- output <- .Call("R_levenberg", nlibs, ngenes, design, t(y), dispersion, offset, weights, beta, mu, tol, maxit, PACKAGE="edgeR")
+ output <- .Call(.cR_levenberg, nlibs, ngenes, design, t(y), dispersion, offset, weights, beta, mu, tol, maxit)
# Check for error condition
if (is.character(output)) { stop(output) }
diff --git a/R/mglmOneGroup.R b/R/mglmOneGroup.R
index 7e18837..f2513f2 100644
--- a/R/mglmOneGroup.R
+++ b/R/mglmOneGroup.R
@@ -1,7 +1,7 @@
-mglmOneGroup <- function(y,dispersion=0,offset=0,weights=NULL,maxit=50,tol=1e-10,verbose=FALSE)
+mglmOneGroup <- function(y,dispersion=0,offset=0,weights=NULL,maxit=50,tol=1e-10,verbose=FALSE,coef.start=NULL)
# Fit single-group negative-binomial glm
# Aaron Lun and Gordon Smyth
-# 18 Aug 2010. Last modified 22 Nov 2013.
+# 18 Aug 2010. Last modified 11 Sep 2014.
{
# Check y
y <- as.matrix(y)
@@ -18,6 +18,9 @@ mglmOneGroup <- function(y,dispersion=0,offset=0,weights=NULL,maxit=50,tol=1e-10
# Check offset
if(typeof(offset) != "double") stop("offset not floating point number")
+# Check starting values
+ if (typeof(coef.start) != "double") storage.mode(coef.start) <- "double"
+
# All-Poisson special case
if(all(dispersion==0) && is.null(weights)) {
N <- exp(offset)
@@ -39,8 +42,7 @@ mglmOneGroup <- function(y,dispersion=0,offset=0,weights=NULL,maxit=50,tol=1e-10
weights <- expandAsMatrix(weights,dim(y))
# Fisher scoring iteration.
-# Matrices are transposed so that values for each tag are in consecutive memory locations in C
- output <- .Call("R_one_group", ntags, nlibs, y, dispersion, offset, weights, maxit, tol, PACKAGE="edgeR")
+ output <- .Call(.cR_one_group, y, dispersion, offset, weights, maxit, tol, coef.start)
# Check error condition
if(is.character(output)) stop(output)
diff --git a/R/mglmOneWay.R b/R/mglmOneWay.R
index c83879d..c0cec77 100644
--- a/R/mglmOneWay.R
+++ b/R/mglmOneWay.R
@@ -10,12 +10,12 @@ designAsFactor <- function(design)
g
}
-mglmOneWay <- function(y,design=NULL,dispersion=0,offset=0,weights=NULL,maxit=50,tol=1e-10)
+mglmOneWay <- function(y,design=NULL,dispersion=0,offset=0,weights=NULL,maxit=50,tol=1e-10,coef.start=NULL)
# Fit multiple negative binomial glms with log link
# by Fisher scoring with
# only a single explanatory factor in the model
# Gordon Smyth
-# 11 March 2011. Last modified 21 Nov 2013.
+# 11 March 2011. Last modified 11 Sep 2014.
{
y <- as.matrix(y)
ntags <- nrow(y)
@@ -32,12 +32,17 @@ mglmOneWay <- function(y,design=NULL,dispersion=0,offset=0,weights=NULL,maxit=50
mu <- matrix(0,ntags,ngroups)
offset <- expandAsMatrix(offset,dim(y))
if(!is.null(weights)) weights <- expandAsMatrix(weights,dim(y))
+
firstjofgroup <- rep(0,ngroups)
+ new.start <- NULL
for (g in 1:ngroups) {
j <- which(group==(levels(group)[g]))
firstjofgroup[g] <- j[1]
- mu[,g] <- mglmOneGroup(y[,j,drop=FALSE],dispersion=dispersion,offset=offset[,j,drop=FALSE],weights=weights[,j,drop=FALSE],maxit=maxit,tol=tol)
+ if (!is.null(coef.start)) { new.start <- coef.start %*% design[firstjofgroup[g],] }
+ mu[,g] <- mglmOneGroup(y[,j,drop=FALSE],dispersion=dispersion,offset=offset[,j,drop=FALSE],weights=weights[,j,drop=FALSE],maxit=maxit,tol=tol,
+ coef.start=new.start)
}
+
designunique <- design[firstjofgroup,,drop=FALSE]
mu1 <- pmax(mu,-1e8)
beta <- t(solve(designunique,t(mu1)))
diff --git a/R/nbinomDeviance.R b/R/nbinomDeviance.R
index 8ea3054..5d2357b 100644
--- a/R/nbinomDeviance.R
+++ b/R/nbinomDeviance.R
@@ -9,7 +9,10 @@ nbinomDeviance <- function(y,mean,dispersion=0,weights=NULL)
if(!is.matrix(y)) y <- array(y, c(1L,length(y)), if(!is.null(names(y))) list(NULL,names(y)))
d <- nbinomUnitDeviance(y=y,mean=mean,dispersion=dispersion)
- if(!is.null(weights)) d <- weights*d
+ if(!is.null(weights)) {
+ weights <- expandAsMatrix(weights, dim(d))
+ d <- weights*d
+ }
rowSums(d)
}
@@ -33,7 +36,7 @@ nbinomUnitDeviance <- function(y,mean,dispersion=0)
if(lend < ntags) dispersion <- rep_len(dispersion, length.out=ntags)
if(lend > ntags && lend < nobs) dispersion <- rep_len(dispersion, length.out=nobs)
- out <- .Call("R_compute_nbdev", y=y, mu=mean, phi=dispersion, PACKAGE="edgeR")
+ out <- .Call(.cR_compute_nbdev, y=y, mu=mean, phi=dispersion)
# Check error status
if (is.character(out)) stop(out)
diff --git a/R/processAmplicons.R b/R/processAmplicons.R
new file mode 100644
index 0000000..50a78d2
--- /dev/null
+++ b/R/processAmplicons.R
@@ -0,0 +1,209 @@
+# Code to process hairpin reads from Illumina sequencer
+# Assume fixed structure of read:
+# Barcode + Common sequence + Hairpin sequence
+# If readfile2, barcodeStartRev and barcodeEndRev are not NULL, reverse read sequences contain reverse barcodes
+
+processAmplicons = function(readfile, readfile2=NULL, barcodefile, hairpinfile,
+ barcodeStart=1, barcodeEnd=5, barcodeStartRev=NULL, barcodeEndRev=NULL, hairpinStart=37, hairpinEnd=57,
+ allowShifting=FALSE, shiftingBase = 3,
+ allowMismatch=FALSE, barcodeMismatchBase = 1, hairpinMismatchBase = 2,
+ allowShiftedMismatch = FALSE,
+ verbose = FALSE) {
+
+ checkFileExistence = function(readfilenames){
+ if ((length(readfilenames) == 1) && (!file.exists(readfilenames)))
+ stop("Read file doesn't exist.\n")
+ if (length(readfilenames) > 1){
+ for(i in 1:length(readfilenames)){
+ if (!file.exists(readfilenames[i]))
+ stop(paste("Read file ", readfilenames[i], " doesn't exist. \n", sep=""))
+ }
+ }
+ }
+
+ # Check file existence
+ checkFileExistence(readfile);
+ if (is.null(readfile2)) {
+ IsPairedReads = FALSE;
+ } else {
+ IsPairedReads = TRUE;
+ if (length(readfile) != length(readfile2))
+ stop("readfile and readfile2 should match each other.")
+ checkFileExistence(readfile2);
+ }
+
+ if (!file.exists(barcodefile))
+ stop("Barcode file doesn't exist.\n")
+ if (!file.exists(hairpinfile))
+ stop("Hairpin file doesn't exist.\n")
+
+ # Validate input parameters
+ reads <- file(readfile[1], "rt");
+ first_read <- readLines(reads, 2)
+ readlength <- nchar(first_read[2])
+
+ if ((barcodeStart < 1) || (barcodeStart > readlength))
+ stop("Invalid barcode start position!\n")
+ if ((barcodeEnd < 1) || (barcodeEnd > readlength))
+ stop("Invalid barcode end position!\n")
+ if (barcodeEnd <= barcodeStart)
+ stop("Barcode end position should be greater than barcode start position. \n")
+ if ((hairpinStart < 1) || (hairpinStart > readlength))
+ stop("Invalid hairpin start position!")
+ if ((hairpinEnd < 1) || (hairpinEnd > readlength))
+ stop("Invalid hairpin end position!")
+ if (hairpinEnd <= hairpinStart)
+ stop("Hairpin end position should be greater than hairpin start position. \n")
+
+ close(reads)
+
+ if (IsPairedReads) {
+ reads <- file(readfile2[1], "rt");
+ first_read <- readLines(reads, 2)
+ readlength2 <- nchar(first_read[2])
+ close(reads)
+
+ if ( (is.null(barcodeStartRev)) || (is.null(barcodeEndRev)) )
+ stop("readfile2 is supplied, barcodeStartRev and barcodeEndRev should be specified. ")
+ if ((barcodeStartRev < 1) || (barcodeStartRev > readlength2))
+ stop("Invalid reverse barcode start position!\n")
+ if ((barcodeEndRev < 1) || (barcodeEndRev > readlength2))
+ stop("Invalid reverse barcode end position!\n")
+ if (barcodeEndRev <= barcodeStartRev)
+ stop("Reverse barcode end position should be greater than reverse barcode start position. \n")
+ }
+
+ # Validate barcodes
+ barcodelength <- barcodeEnd - barcodeStart + 1;
+ barcodes <- read.table(barcodefile, header=TRUE, sep="\t");
+ numbc <- nrow(barcodes)
+ barcodeIDIndex = which(colnames(barcodes) == 'ID')
+ if (length(barcodeIDIndex) < 1)
+ stop("Can't find column ID in ", barcodefile)
+ barcodeseqIndex = which(colnames(barcodes) == 'Sequences')
+ if (length(barcodeseqIndex) < 1)
+ stop("Can't find column Sequences in ", barcodefile)
+ barcodeIDs <- as.character(barcodes[, barcodeIDIndex])
+ barcodeseqs <- as.character(barcodes[, barcodeseqIndex])
+ if (length(unique(barcodeIDs)) != numbc)
+ stop("There are duplicate barcode IDs.\n")
+ if ((min(nchar(barcodeseqs)) != barcodelength) || (max(nchar(barcodeseqs)) != barcodelength))
+ stop(paste("Barcode sequence length is set to ", barcodelength, ", there are barcode sequence not with specified length.\n", sep=""))
+ if (IsPairedReads) {
+ barcodeseqRevIndex = which(colnames(barcodes) == 'SequencesReverse')
+ if (length(barcodeseqRevIndex) < 1)
+ stop("Can't find column SequencesReverse in ", barcodefile)
+ barcodeseqsReverse <- as.character(barcodes[, barcodeseqRevIndex])
+ barcodelengthReverse <- barcodeEndRev - barcodeStartRev + 1;
+ if ((min(nchar(barcodeseqsReverse)) != barcodelengthReverse) || (max(nchar(barcodeseqsReverse)) != barcodelengthReverse))
+ stop(paste("Reverse barcode sequence length is set to ", barcodelength, ", there are reverse barcode sequence not in specified length.\n", sep=""))
+ concatenatedBarcodeseqs = paste(barcodeseqs, barcodeseqsReverse, sep="")
+ if (length(unique(concatenatedBarcodeseqs)) != numbc)
+ stop("There are duplicate forward/reverse barcode sequences.\n")
+ } else {
+ if (length(unique(barcodeseqs)) != numbc)
+ stop("There are duplicate barcode sequences.\n")
+ }
+
+ # Validate hairpins
+ hairpinlength <- hairpinEnd - hairpinStart + 1;
+ hairpins <- read.table(hairpinfile, header=TRUE, sep="\t");
+ numhp <- nrow(hairpins)
+
+ hairpinIDIndex = which(colnames(hairpins) == 'ID')
+ if (length(hairpinIDIndex) < 1)
+ stop("Can't find column ID in ", hairpinfile)
+ hairpinIDs <- as.character(hairpins[, hairpinIDIndex])
+ hairpinSeqIndex = which(colnames(hairpins) == 'Sequences')
+ if (length(hairpinSeqIndex) < 1)
+ stop("Can't find column Sequences in ", hairpinfile)
+ hairpinseqs <- as.character(hairpins[, hairpinSeqIndex])
+
+ if ((min(nchar(hairpinseqs)) != hairpinlength) || (max(nchar(hairpinseqs)) != hairpinlength))
+ stop(paste("Hairpin sequence length is set to ", hairpinlength, ", there are hairpin sequences not with specified length.\n", sep=""))
+ if (length(unique(hairpinseqs)) != numhp)
+ stop("There are duplicate hairpin sequences.\n")
+ if (length(unique(hairpinIDs)) != numhp)
+ stop("There are duplicate hairpin IDs.\n")
+
+ # validate mismatch/shifting input parameters
+ if (allowShifting) {
+ if ((shiftingBase <= 0) || (shiftingBase > 5))
+ stop("To allow hairpin matching at a shifted position, please input a positive shiftingBase no greater than 5. ")
+ }
+ if (allowMismatch) {
+ if ((barcodeMismatchBase < 0) || (barcodeMismatchBase > 2))
+ stop("To allow mismatch in barcode sequence, please input a non-negative barcodeMismatchBase no greater than than 2. ")
+ if ((hairpinMismatchBase < 0) || (hairpinMismatchBase > 4))
+ stop("To allow mismatch in hairpin sequence, please input a non-negative hairpinMismatchBase no greater than than 4. ")
+ }
+ if (allowShiftedMismatch) {
+ if ((!allowShifting) || (!allowMismatch)){
+ stop("allowShiftedMismatch option can only be turned on when allowShiting and allowMismatch are both TRUE. ")
+ }
+ }
+
+ # passing only barcode/hairpin sequences to C function
+ tempbarcodefile <- paste("Barcode", as.character(Sys.getpid()), "temp.txt", sep = "_")
+ if (IsPairedReads) {
+ bothBarcodeSeqs = cbind(barcodeseqs, barcodeseqsReverse)
+ write.table(bothBarcodeSeqs, file=tempbarcodefile, sep="\t", quote=FALSE, row.names=FALSE, col.names=FALSE);
+ } else {
+ write.table(barcodeseqs, file=tempbarcodefile, sep="\t", quote=FALSE, row.names=FALSE, col.names=FALSE);
+ }
+
+ temphairpinfile <- paste("Hairpin", as.character(Sys.getpid()), "temp.txt", sep = "_")
+ write.table(hairpinseqs, file=temphairpinfile, sep="\t", quote=FALSE, row.names=FALSE, col.names=FALSE);
+
+ tempoutfile <- paste("ReadcountSummary", as.character(Sys.getpid()), "output.txt", sep = "_")
+
+ tryCatch({
+ if (!IsPairedReads) {
+ readfile2 = "DummyReadfile.fastq"
+ barcodeStartRev = 0;
+ barcodeEndRev = 0;
+ }
+ .C(.cprocessHairpinReads, IsPairedReads, readfile, readfile2, as.integer(length(readfile)),
+ as.character(tempbarcodefile), as.character(temphairpinfile),
+ as.integer(barcodeStart), as.integer(barcodeEnd), as.integer(barcodeStartRev), as.integer(barcodeEndRev), as.integer(hairpinStart), as.integer(hairpinEnd),
+ as.integer(allowShifting), as.integer(shiftingBase),
+ as.integer(allowMismatch), as.integer(barcodeMismatchBase), as.integer(hairpinMismatchBase),
+ as.integer(allowShiftedMismatch),
+ as.character(tempoutfile), as.integer(verbose))
+
+ hairpinReadsSummary <- read.table(tempoutfile, sep="\t", header=FALSE)
+ },
+ error = function(err) {
+ print(paste("ERROR MESSAGE: ",err))
+ },
+ finally = {
+ if (file.exists(tempbarcodefile))
+ file.remove(tempbarcodefile)
+ if (file.exists(temphairpinfile))
+ file.remove(temphairpinfile)
+ if (file.exists(tempoutfile))
+ file.remove(tempoutfile)
+ }
+ )
+
+ if (exists("hairpinReadsSummary")) {
+
+ if (nrow(hairpinReadsSummary) != length(hairpinIDs))
+ stop("Number of hairpins from result count matrix doesn't agree with given hairpin list. ")
+ if (ncol(hairpinReadsSummary) != length(barcodeIDs))
+ stop("Number of barcodes from result count matrix doesn't agree with given barcode list. ")
+ colnames(hairpinReadsSummary) = barcodeIDs
+ rownames(hairpinReadsSummary) = hairpinIDs
+ x <- DGEList(counts = hairpinReadsSummary, genes = hairpins)
+ if(!is.null(barcodes$group)) {
+ x$samples = cbind("ID"=barcodes$ID, "lib.size"=x$samples$lib.size,
+ "norm.factors"=x$samples$norm.factors,
+ barcodes[,-match(c("ID","Sequences"), colnames(barcodes))])
+ } else {
+ x$samples = cbind("ID"=barcodes$ID, x$samples, barcodes[,-match(c("ID","Sequences"), colnames(barcodes))])
+ }
+ } else {
+ stop("An error occured in processHairpinReads.")
+ }
+ x
+}
diff --git a/R/processHairpinReads.R b/R/processHairpinReads.R
deleted file mode 100644
index aa19562..0000000
--- a/R/processHairpinReads.R
+++ /dev/null
@@ -1,124 +0,0 @@
-# Code to process hairpin reads from Illumina sequencer
-# Assume fixed structure of read:
-# Barcode + Common sequence + Hairpin sequence
-
-processHairpinReads = function(readfile, barcodefile, hairpinfile,
- barcodeStart=1, barcodeEnd=5, hairpinStart=37, hairpinEnd=57,
- allowShifting=FALSE, shiftingBase = 3,
- allowMismatch=FALSE, barcodeMismatchBase = 1, hairpinMismatchBase = 2,
- allowShiftedMismatch = FALSE,
- verbose = FALSE) {
-
- # Check file existence
- if ((length(readfile) == 1) && (!file.exists(readfile)))
- stop("Read file doesn't exist.\n")
- if (length(readfile) > 1){
- for(i in 1:length(readfile)){
- if (!file.exists(readfile[i]))
- stop(paste("Read file ", readfile[i], " doesn't exist. \n", sep=""))
- }
- }
- if (!file.exists(barcodefile))
- stop("Barcode file doesn't exist.\n")
- if (!file.exists(hairpinfile))
- stop("Hairpin file doesn't exist.\n")
-
- # Validating params
- reads <- file(readfile[1], "rt");
- first_read <- readLines(reads, 2)
- readlength <- nchar(first_read[2])
-
- if (barcodeStart > barcodeEnd)
- stop("Barcode start position is greater than barcode end position.\n")
- if ((barcodeStart < 1) || (barcodeStart > readlength))
- stop("Invalid barcode start position!\n")
- if ((barcodeEnd < 1) || (barcodeEnd > readlength))
- stop("Invalid barcode end position!\n")
- if (barcodeEnd <= barcodeStart)
- stop("Barcode end position should be greater than barcode start position. \n")
- if ((hairpinStart < 1) || (hairpinStart > readlength))
- stop("Invalid hairpin start position!")
- if ((hairpinEnd < 1) || (hairpinEnd > readlength))
- stop("Invalid hairpin end position!")
- if (hairpinEnd <= hairpinStart)
- stop("Hairpin end position should be greater than hairpin start position. \n")
-
- # check that barcodes and hairpins provided have no duplicates, are in specified length.
- barcodelength <- barcodeEnd - barcodeStart + 1;
- barcodes <- read.table(barcodefile, header=TRUE, sep="\t");
- numbc <- nrow(barcodes)
- barcodeseqs <- as.character(barcodes$Sequences) #[,2])
- barcodenames <- as.character(barcodes$ID) #[,1])
- if ((min(nchar(barcodeseqs)) != barcodelength) || (max(nchar(barcodeseqs)) != barcodelength))
- stop(paste("Barcode sequence length is set to ", barcodelength, ", there are barcode sequence not in specified length.\n", sep=""))
- if (length(unique(barcodeseqs)) != numbc)
- stop("There are duplicate barcode sequences.\n")
- if (length(unique(barcodenames)) != numbc)
- stop("There are duplicate barcode names.\n")
- tempbarcodefile <- paste("Barcode", as.character(Sys.getpid()), "temp.txt", sep = "_")
- # passing only barcode sequences to C function
- write.table(barcodeseqs, file=tempbarcodefile, sep="\t", quote=FALSE, row.names=FALSE, col.names=FALSE);
-
- hairpinlength <- hairpinEnd - hairpinStart + 1;
- hairpins <- read.table(hairpinfile, header=TRUE, sep="\t");
- numhp <- nrow(hairpins)
- hairpinseqs <- as.character(hairpins$Sequences) #[,2])
- hairpinnames <- as.character(hairpins$ID) #[,1])
- if ((min(nchar(hairpinseqs)) != hairpinlength) || (max(nchar(hairpinseqs)) != hairpinlength))
- stop(paste("Hairpin sequence length is set to ", hairpinlength, ", there are hairpin sequences not in specified length.\n", sep=""))
- if (length(unique(hairpinseqs)) != numhp)
- stop("There are duplicate hairpin sequences.\n")
- if (length(unique(hairpinnames)) != numhp)
- stop("There are duplicate hairpin names.\n")
-
- # passing only hairpin sequences to C function
- temphairpinfile <- paste("Hairpin", as.character(Sys.getpid()), "temp.txt", sep = "_")
- write.table(hairpinseqs, file=temphairpinfile, sep="\t", quote=FALSE, row.names=FALSE, col.names=FALSE);
-
- if (allowShifting) {
- if ((shiftingBase <= 0) || (shiftingBase > 5))
- stop("To allow hairpin matching at a shifted position, please input a positive shiftingBase no greater than 5. ")
- }
-
- if (allowMismatch) {
- if ((barcodeMismatchBase <= 0) || (barcodeMismatchBase > 2))
- stop("To allow mismatch in barcode sequence, please input a positive barcodeMismatchBase no greater than than 2. ")
- if ((hairpinMismatchBase <= 0) || (hairpinMismatchBase > 4))
- stop("To allow mismatch in hairpin sequence, please input a positive hairpinMismatchBase no greater than than 4. ")
- }
-
- if (allowShiftedMismatch) {
- if ((!allowShifting) || (!allowMismatch)){
- stop("allowShiftedMismatch option can only be turned on when allowShiting and allowMismatch are both TRUE. ")
- }
- }
- tempoutfile <- paste("ReadcountSummary", as.character(Sys.getpid()), "output.txt", sep = "_")
-
- .C("processHairpinReads", readfile, as.integer(length(readfile)), as.character(tempbarcodefile), as.character(temphairpinfile),
- as.integer(barcodeStart), as.integer(barcodeEnd), as.integer(hairpinStart), as.integer(hairpinEnd),
- as.integer(allowShifting), as.integer(shiftingBase),
- as.integer(allowMismatch), as.integer(barcodeMismatchBase), as.integer(hairpinMismatchBase),
- as.integer(allowShiftedMismatch),
- as.character(tempoutfile), as.integer(verbose))
-
- summary <- read.table(tempoutfile, sep="\t", header=FALSE)
- file.remove(tempoutfile)
- file.remove(tempbarcodefile)
- file.remove(temphairpinfile)
- close(reads)
- if (nrow(summary) != length(hairpinnames))
- stop("Number of hairpins from result count matrix doesn't agree with given hairpin list. ")
- if (ncol(summary) != length(barcodenames))
- stop("Number of barcodes from result count matrix doesn't agree with given barcode list. ")
- colnames(summary) = barcodenames
- rownames(summary) = hairpinnames
- x <- DGEList(counts = summary, genes = hairpins)
- if(!is.null(barcodes$group)) {
- x$samples = cbind("ID"=barcodes$ID, "lib.size"=x$samples$lib.size,
- "norm.factors"=x$samples$norm.factors,
- barcodes[,-match(c("ID","Sequences"), colnames(barcodes))])
- } else {
- x$samples = cbind("ID"=barcodes$ID, x$samples, barcodes[,-match(c("ID","Sequences"), colnames(barcodes))])
- }
- x
-}
diff --git a/R/readDGE.R b/R/readDGE.R
index 67af8c2..34e81ff 100644
--- a/R/readDGE.R
+++ b/R/readDGE.R
@@ -1,6 +1,6 @@
readDGE <- function(files,path=NULL,columns=c(1,2),group=NULL,labels=NULL,...)
-# Read and collate a set of DGE data files, one library per file
-# Last modified 16 October 2010.
+# Read and collate a set of count data files, each file containing counts for one library
+# Created 2006. Last modified 15 August 2014.
{
x <- list()
if(is.data.frame(files)) {
diff --git a/R/residDF.R b/R/residDF.R
index 282fd48..d458a60 100644
--- a/R/residDF.R
+++ b/R/residDF.R
@@ -1,5 +1,7 @@
.residDF <- function(zero, design)
-# 6 Jan 2014
+# Effective residual degrees of freedom after adjusting for exact zeros
+# Gordon Smyth and Aaron Lun
+# Created 6 Jan 2014. Last modified 2 Sep 2014
{
nlib <- ncol(zero)
ncoef <- ncol(design)
@@ -16,7 +18,7 @@
if(any(somezero)) {
zero2 <- zero[somezero,,drop=FALSE]
-# Integer packing will only work for 31 libraries at a time.
+# Integer packing will only work for 31 libraries at a time.
assembly <- list()
collected <- 0L
step <- 31L
@@ -28,7 +30,7 @@
collected <- collected + step
}
-# Figuring out the unique components.
+# Figuring out the unique components.
o <- do.call(order, assembly)
nzeros <- sum(somezero)
is.different <- logical(nzeros)
@@ -38,7 +40,7 @@
first.of.each <- which(is.different)
last.of.each <- c(first.of.each[-1]-1L, nzeros)
-# Identifying the true residual d.f. for each of these rows.
+# Identifying the true residual d.f. for each of these rows.
DF2 <- nlib-nzero[somezero]
for (u in 1:length(first.of.each)) {
i <- o[first.of.each[u]:last.of.each[u]]
diff --git a/R/subsetting.R b/R/subsetting.R
index 5122ecb..734143a 100644
--- a/R/subsetting.R
+++ b/R/subsetting.R
@@ -21,13 +21,13 @@ function(object, i, j, keep.lib.sizes=TRUE)
assign("[.DGEGLM",
function(object, i, j)
# Subsetting for DGEGLM objects
-# 11 May 2011. Last modified 11 Dec 2013.
+# 11 May 2011. Last modified 09 May 2014.
{
if(!missing(j)) stop("Subsetting columns not allowed for DGEGLM object.",call.=FALSE)
# Recognized components
IJ <- character(0)
- IX <- c("counts","offset","weights","genes","coefficients","fitted.values")
+ IX <- c("counts","offset","weights","genes","coefficients","fitted.values","unshrunk.coefficients")
I <- c("AveLogCPM","trended.dispersion","tagwise.dispersion","prior.n","prior.df","dispersion","df.residual","deviance","iter","failed")
JX <- c("samples","design")
@@ -53,13 +53,13 @@ function(object, i, j)
assign("[.DGELRT",
function(object, i, j)
# Subsetting for DGELRT objects
-# 6 April 2011. Last modified 11 Dec 2013.
+# 6 April 2011. Last modified 09 May 2014.
{
if(!missing(j)) stop("Subsetting columns not allowed for DGELRT object.",call.=FALSE)
# Recognized components
IJ <- character(0)
- IX <- c("counts","offset","weights","genes","coefficients","fitted.values","table")
+ IX <- c("counts","offset","weights","genes","coefficients","fitted.values","table","unshrunk.coefficients")
I <- c("AveLogCPM","trended.dispersion","tagwise.dispersion","prior.n","prior.df","dispersion","df.residual","deviance","iter","failed","df.test")
JX <- character(0)
diff --git a/R/treatDGE.R b/R/treatDGE.R
new file mode 100644
index 0000000..36cdf89
--- /dev/null
+++ b/R/treatDGE.R
@@ -0,0 +1,96 @@
+treatDGE <- function(glmfit, coef=ncol(glmfit$design), contrast=NULL, lfc=0)
+# Likelihood ratio test with threshold
+# Yunshun Chen and Gordon Smyth
+# 05 May 2014.
+{
+ if(lfc < 0) stop("lfc has to be non-negative")
+# Check glmfit
+ if(!is(glmfit,"DGEGLM")) {
+ if(is(glmfit,"DGEList") && is(coef,"DGEGLM")) {
+ stop("First argument is no longer required. Rerun with just the glmfit and coef/contrast arguments.")
+ }
+ stop("glmfit must be an DGEGLM object (usually produced by glmFit).")
+ }
+ if(is.null(glmfit$AveLogCPM)) glmfit$AveLogCPM <- aveLogCPM(glmfit)
+ nlibs <- ncol(glmfit)
+ ngenes <- nrow(glmfit)
+
+# Check design matrix
+ design <- as.matrix(glmfit$design)
+ nbeta <- ncol(design)
+ if(nbeta < 2) stop("Need at least two columns for design, usually the first is the intercept column")
+ coef.names <- colnames(design)
+
+ if(glmfit$prior.count!=0){
+ coefficients.mle <- glmfit$unshrunk.coefficients
+ } else {
+ coefficients.mle <- glmfit$coefficients
+ }
+
+# Evaluate logFC for coef to be tested
+# Note that contrast takes precedence over coef: if contrast is given
+# then reform design matrix so that contrast of interest is the first column
+ if(is.null(contrast)) {
+ if(length(coef) > 1) coef <- unique(coef)
+ if(is.character(coef)) {
+ check.coef <- coef %in% colnames(design)
+ if(any(!check.coef)) stop("One or more named coef arguments do not match a column of the design matrix.")
+ coef.name <- coef
+ coef <- match(coef, colnames(design))
+ }
+ else
+ coef.name <- coef.names[coef]
+ logFC <- coefficients.mle[, coef, drop=FALSE]/log(2)
+ } else {
+ contrast <- as.matrix(contrast)
+ reform <- contrastAsCoef(design, contrast=contrast, first=TRUE)
+ coef <- 1
+ logFC <- drop((coefficients.mle %*% contrast)/log(2))
+ contrast <- drop(contrast)
+ i <- contrast!=0
+ coef.name <- paste(paste(contrast[i],coef.names[i],sep="*"),collapse=" ")
+ design <- reform$design
+ }
+ logFC <- as.vector(logFC)
+
+# Null design matrix
+ design0 <- design[, -coef, drop=FALSE]
+
+# LRT of beta_0 = zero
+ fit0.zero <- glmFit(glmfit$counts, design=design0, offset=glmfit$offset, weights=glmfit$weights, dispersion=glmfit$dispersion, prior.count=0)
+ X2.zero <- pmax(0, fit0.zero$deviance - glmfit$deviance)
+
+# LRT of beta_0 = tau
+ offset.adj <- matrix(-lfc*log(2), ngenes, 1)
+ up <- logFC >= 0
+ offset.adj[up, ] <- lfc*log(2)
+ offset.new <- glmfit$offset + offset.adj %*% t(design[, coef, drop=FALSE])
+ fit0.tau <- glmFit(glmfit$counts, design=design0, offset=offset.new, weights=glmfit$weights, dispersion=glmfit$dispersion, prior.count=0)
+ fit1.tau <- glmFit(glmfit$counts, design=design, offset=offset.new, weights=glmfit$weights, dispersion=glmfit$dispersion, prior.count=0)
+ X2.tau <- pmax(0, fit0.tau$deviance - fit1.tau$deviance)
+
+ z.zero <- sqrt(X2.zero)
+ z.tau <- sqrt(X2.tau)
+ within <- abs(logFC) <= lfc
+ sgn <- 2*within - 1
+ p.value <- pnorm( z.tau*sgn ) + pnorm( -z.tau*sgn-2*z.zero )
+
+ tab <- data.frame(
+ logFC=logFC,
+ logCPM=glmfit$AveLogCPM,
+ PValue=p.value,
+ row.names=rownames(glmfit)
+ )
+ glmfit$lfc <- lfc
+ glmfit$counts <- NULL
+ glmfit$table <- tab
+ glmfit$comparison <- coef.name
+ new("DGELRT",unclass(glmfit))
+}
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/inst/CITATION b/inst/CITATION
index e78f0d5..4daae0d 100644
--- a/inst/CITATION
+++ b/inst/CITATION
@@ -12,6 +12,18 @@ citEntry(
)
citEntry(
+ entry="article",
+ author = "McCarthy, Davis J. and Chen, Yunshun and Smyth, Gordon K.",
+ title = "Differential expression analysis of multifactor RNA-Seq experiments with respect to biological variation",
+ year = 2012,
+ journal = "Nucleic Acids Research",
+ volume = 40,
+ number = 10,
+ pages = 4288-4297,
+ textVersion = "McCarthy DJ, Chen Y and Smyth GK (2012). Differential expression analysis of multifactor RNA-Seq experiments with respect to biological variation. Nucleic Acids Research 40, 4288-4297"
+)
+
+citEntry(
entry="article",
title = "Moderated statistical tests for assessing differences in tag abundance",
author = "Mark D Robinson and Gordon K Smyth",
@@ -32,15 +44,3 @@ citEntry(
year = 2008,
textVersion = "Robinson MD and Smyth GK (2008). Small-sample estimation of negative binomial dispersion, with applications to SAGE data. Biostatistics, 9, 321-332"
)
-
-citEntry(
- entry="article",
- author = "McCarthy, Davis J. and Chen, Yunshun and Smyth, Gordon K.",
- title = "Differential expression analysis of multifactor RNA-Seq experiments with respect to biological variation",
- year = 2012,
- journal = "Nucleic Acids Research",
- volume = 40,
- number = 10,
- pages = 4288-4297,
- textVersion = "McCarthy DJ, Chen Y and Smyth GK (2012). Differential expression analysis of multifactor RNA-Seq experiments with respect to biological variation. Nucleic Acids Research 40, 4288-4297"
-)
diff --git a/inst/NEWS.Rd b/inst/NEWS.Rd
index 99db9db..3e4697c 100644
--- a/inst/NEWS.Rd
+++ b/inst/NEWS.Rd
@@ -2,14 +2,43 @@
\title{edgeR News}
\encoding{UTF-8}
-\section{Version 3.6.3}{\itemize{
+\section{Version 3.8.0}{\itemize{
+
\item
-The SAGE datasets from Zhang et al (1997) are no longer included with the edgeR package.
+New goana() methods for DGEExact and DGELRT objects to perform Gene Ontology analysis of differentially expressed genes using Entrez Gene IDs.
+
+\item
+New functions diffSpliceDGE(), topSpliceDGE() and plotSpliceDGE() for detecting differential exon usage and displaying results.
+
+\item
+New function treatDGE() that tests for DE relative to a specified log2-FC threshold.
\item
-A variety of minor code improvements and bug fixes have been implemented.
+glmQLFTest() is split into three functions: glmQLFit() for fitting quasi-likelihood GLMs, glmQLFTest() for performing quasi-likelihood F-tests and plotQLDisp() for plotting quasi-likelihood dispersions.
+
+\item
+processHairpinReads() renamed to processAmplicons() and allows for paired end data.
+
+\item
+glmFit() now stores unshrunk.coefficients from prior.count=0 as well as shrunk coefficients.
+
+\item
+estimateDisp() now has a min.row.sum argument to protect against all zero counts.
+
+\item
+APL calculations in estimateDisp() are hot-started using fitted values from previous dispersions, to avoid discontinuous APL landscapes.
+
+\item
+adjustedProfileLik() is modified to accept starting coefficients. glmFit() now passes starting coefficients to mglmOneGroup().
+
+\item
+calcNormFactors() is now a S3 generic function.
+
+\item
+The SAGE datasets from Zhang et al (1997) are no longer included with the edgeR package.
}}
+
\section{Version 3.6.0}{\itemize{
\item
diff --git a/inst/doc/edgeR.Rnw b/inst/doc/edgeR.Rnw
index 34e205e..af5781e 100755
--- a/inst/doc/edgeR.Rnw
+++ b/inst/doc/edgeR.Rnw
@@ -14,10 +14,10 @@
\begin{document}
\title{edgeR Package Introduction}
-\author{Mark Robinson, Davis McCarthy, Yunshun Chen,\\
-Aaron Lun, Gordon K.\ Smyth}
+\author{Yunshun Chen, Davis McCarthy, Aaron Lun,\\
+Xiaobei Zhou, Mark Robinson, Gordon K.\ Smyth}
\date{10 October 2012\\
-Revised 10 November 2013}
+Revised 8 October 2014}
\maketitle
diff --git a/inst/doc/edgeR.pdf b/inst/doc/edgeR.pdf
index 607b303..a20e249 100644
Binary files a/inst/doc/edgeR.pdf and b/inst/doc/edgeR.pdf differ
diff --git a/man/adjustedProfileLik.Rd b/man/adjustedProfileLik.Rd
index 6f81865..2591068 100644
--- a/man/adjustedProfileLik.Rd
+++ b/man/adjustedProfileLik.Rd
@@ -8,7 +8,8 @@ Compute adjusted profile-likelihoods for estimating the dispersion parameters of
}
\usage{
-adjustedProfileLik(dispersion, y, design, offset, weights=NULL, adjust=TRUE)
+adjustedProfileLik(dispersion, y, design, offset, weights=NULL, adjust=TRUE,
+ start=NULL, get.coef=FALSE)
}
\arguments{
@@ -16,12 +17,16 @@ adjustedProfileLik(dispersion, y, design, offset, weights=NULL, adjust=TRUE)
\item{y}{numeric matrix of counts.}
\item{design}{numeric matrix giving the design matrix.}
\item{offset}{numeric matrix of same size as \code{y} giving offsets for the log-linear models. Can be a scalor or a vector of length \code{ncol(y)}, in which case it is expanded out to a matrix.}
-\item{weights}{optional numeric matrix giving observation weights}
+\item{weights}{optional numeric matrix giving observation weights.}
\item{adjust}{logical, if \code{TRUE} then Cox-Reid adjustment is made to the log-likelihood, if \code{FALSE} then the log-likelihood is returned without adjustment.}
+\item{start}{numeric matrix of starting values for the GLM coefficients, to be passed to \code{\link{glmFit}}.}
+\item{get.coef}{logical, specifying whether fitted GLM coefficients should be returned.}
}
\value{
-vector of adjusted profile log-likelihood values, one for each row of \code{y}.
+If \code{get.coef==FALSE}, a vector of adjusted profile log-likelihood values is returned containing one element for each row of \code{y}.
+
+Otherwise, a list is returned containing \code{apl}, the aforementioned vector of adjusted profile likelihoods; and \code{beta}, a numeric matrix of fitted GLM coefficients.
}
\details{
@@ -33,6 +38,9 @@ The conditional likelihood approach is a technique for adjusting the likelihood
When estimating the dispersion, the nuisance parameters are the coefficients in the linear model.
This implementation calls the LAPACK library to perform the Cholesky decomposition during adjustment estimation.
+
+The purpose of \code{start} and \code{get.coef} is to allow hot-starting for multiple calls to \code{adjustedProfileLik}, when only the \code{dispersion} is altered.
+Specifically, the returned GLM coefficients from one call with \code{get.coef==TRUE} can be used as the \code{start} values for the next call.
}
\references{
diff --git a/man/aveLogCPM.Rd b/man/aveLogCPM.Rd
index 34251c6..b9e2a95 100644
--- a/man/aveLogCPM.Rd
+++ b/man/aveLogCPM.Rd
@@ -33,7 +33,7 @@ An average value of \code{prior.count} is added to the counts before running \co
This function is similar to
-\code{rowMeans(cpm(y, log=TRUE, \dots))},
+\code{log2(rowMeans(cpm(y, \dots)))},
but with the refinement that larger library sizes are given more weight in the average.
The two versions will agree for large values of the dispersion.
diff --git a/man/calcNormFactors.Rd b/man/calcNormFactors.Rd
index b0cd255..9d17f19 100644
--- a/man/calcNormFactors.Rd
+++ b/man/calcNormFactors.Rd
@@ -1,15 +1,25 @@
\name{calcNormFactors}
\alias{calcNormFactors}
+\alias{calcNormFactors.DGEList}
+\alias{calcNormFactors.default}
+
\title{Calculate Normalization Factors to Align Columns of a Count Matrix}
\description{
Calculate normalization factors to scale the raw library sizes.
}
\usage{
-calcNormFactors(object, method=c("TMM","RLE","upperquartile","none"), refColumn = NULL,
- logratioTrim = .3, sumTrim = 0.05, doWeighting=TRUE, Acutoff=-1e10, p=0.75)
+\method{calcNormFactors}{DGEList}(object, method=c("TMM","RLE","upperquartile","none"),
+ refColumn=NULL, logratioTrim=.3, sumTrim=0.05, doWeighting=TRUE,
+ Acutoff=-1e10, p=0.75, \dots)
+\method{calcNormFactors}{default}(object, lib.size=NULL, method=c("TMM","RLE",
+ "upperquartile","none"), refColumn=NULL, logratioTrim=.3,
+ sumTrim=0.05, doWeighting=TRUE, Acutoff=-1e10, p=0.75, \dots)
}
+
+
\arguments{
\item{object}{either a \code{matrix} of raw (read) counts or a \code{DGEList} object}
+ \item{lib.size}{numeric vector of library sizes of the \code{object}.}
\item{method}{normalization method to be used}
\item{refColumn}{column to use as reference for \code{method="TMM"}. Can be a column number or a numeric vector of length \code{nrow(object))}.}
\item{logratioTrim}{amount of trim to use on log-ratios ("M" values) for \code{method="TMM"}}
@@ -17,6 +27,7 @@ calcNormFactors(object, method=c("TMM","RLE","upperquartile","none"), refColumn
\item{doWeighting}{logical, whether to compute (asymptotic binomial precision) weights for \code{method="TMM"}}
\item{Acutoff}{cutoff on "A" values to use before trimming for \code{method="TMM"}}
\item{p}{percentile (between 0 and 1) of the counts that is aligned when \code{method="upperquartile"}}
+ \item{\dots}{further arguments that are not currently used.}
}
\details{
diff --git a/man/diffSpliceDGE.Rd b/man/diffSpliceDGE.Rd
new file mode 100644
index 0000000..0bd4376
--- /dev/null
+++ b/man/diffSpliceDGE.Rd
@@ -0,0 +1,71 @@
+\name{diffSpliceDGE}
+\alias{diffSpliceDGE}
+
+\title{Test for Differential Exon Usage}
+\description{Given a negative binomial generalized log-linear model fit at the exon level, test for differential exon usage between experimental conditions.}
+\usage{
+diffSpliceDGE(fit.exon, coef=ncol(fit.exon$design), geneid, exonid=NULL, verbose=TRUE)
+}
+
+\arguments{
+ \item{fit.exon}{an \code{DGEGLM} fitted model object produced by \code{glmFit}. Rows should correspond to exons.}
+ \item{coef}{integer indicating which coefficient of the generalized linear model is to be tested for differential exon usage. Defaults to the last coefficient.}
+ \item{geneid}{gene identifiers. Either a vector of length \code{nrow(fit.exon)} or the name of the column of \code{fit.exon$genes} containing the gene identifiers. Rows with the same ID are assumed to belong to the same gene.}
+ \item{exonid}{exon identifiers. Either a vector of length \code{nrow(fit.exon)} or the name of the column of \code{fit.exon$genes} containing the exon identifiers.}
+ \item{verbose}{logical, if \code{TRUE} some diagnostic information about the number of genes and exons is output.}
+}
+
+\value{
+\code{diffSpliceDGE} produces an object of class \code{DGELRT} containing the component \code{design} from \code{fit.exon} plus the following new components:
+ \item{comparison}{character string describing the coefficient being tested.}
+ \item{coefficients}{numeric vector of coefficients on the natural log scale. Each coefficient is the difference between the log-fold-change for that exon versus the average log-fold-change for the rest exons within the same gene.}
+ \item{exon.F}{numeric vector of F-statistics for exons.}
+ \item{exon.df.test}{numeric vector of testing degrees of freedom for exons.}
+ \item{exon.df.prior}{numeric vector of prior degrees of freedom for exons.}
+ \item{exon.df.residual}{numeric vector of residual degrees of freedom for exons.}
+ \item{exon.p.value}{numeric vector of p-values for exons.}
+ \item{genes}{data.frame of exon annotation}
+ \item{genecolname}{character string giving the name of the column of \code{genes} containing gene IDs.}
+ \item{exoncolname}{character string giving the name of the column of \code{genes} containing exon IDs.}
+ \item{gene.df.test}{numeric vector of testing degrees of freedom for genes.}
+ \item{gene.df.prior}{numeric vector of prior degrees of freedom for genes.}
+ \item{gene.df.residual}{numeric vector of residual degrees of freedom for genes.}
+ \item{gene.Simes.p.value}{numeric vector of Simes' p-values for genes.}
+ \item{gene.F}{numeric vector of F-statistics for gene-level test.}
+ \item{gene.F.p.value}{numeric vector of F-test p-values for genes.}
+ \item{gene.genes}{data.frame of gene annotation.}
+The information and testing results for both exons and genes are sorted by geneid and by exonid within gene.
+}
+
+\details{
+This function tests for differential exon usage for each gene for a given coefficient of the generalized linear model.
+
+Testing for differential exon usage is equivalent to testing whether the exons in each gene have the same log-fold changes as the other exons in the same gene.
+At exon-level, each exon is compared to the average of all other exons for the same gene using quasi-likelihood F-tests.
+At gene-level, two different tests are provided. The first is converting exon-level p-values to gene-level p-values by Simes method.
+The other is an F-test for differences between the exon log-fold-changes within each gene.
+}
+
+\author{Yunshun Chen and Gordon Smyth}
+
+\examples{
+# Gene exon annotation
+Gene <- paste("G", 1:10, sep="")
+Gene <- rep(Gene, each=10)
+Exon <- paste("Ex", 1:10, sep="")
+Gene.Exon <- paste(Gene, Exon, sep=".")
+genes <- data.frame(GeneID=Gene, Gene.Exon=Gene.Exon)
+
+design <- model.matrix(~c(0,0,0,1,1,1))
+mu <- matrix(20, 100, 6)
+mu[1,4:6] <- 200
+counts <- matrix(rnbinom(600,mu=mu,size=20),100,6)
+
+y <- DGEList(counts=counts, lib.size=rep(1e6,6), genes=genes)
+gfit <- glmFit(y, design, dispersion=0.05)
+
+ds <- diffSpliceDGE(gfit, geneid="GeneID")
+topSpliceDGE(ds)
+plotSpliceDGE(ds)
+}
+
diff --git a/man/equalizeLibSizes.Rd b/man/equalizeLibSizes.Rd
index 0280c5d..7b03a95 100644
--- a/man/equalizeLibSizes.Rd
+++ b/man/equalizeLibSizes.Rd
@@ -6,13 +6,14 @@
\description{Adjusts counts so that the effective library sizes are equal, preserving fold-changes between groups and preserving biological variability within each group.}
\usage{
-equalizeLibSizes(object, dispersion=0, common.lib.size)
+equalizeLibSizes(object, dispersion=NULL, common.lib.size)
}
\arguments{
\item{object}{\code{\link[edgeR:DGEList-class]{DGEList}} object}
-\item{dispersion}{numeric scalar or vector of \code{dispersion} parameters; if a scalar, then a common dispersion parameter is used for all tags}
+\item{dispersion}{numeric vector of dispersion parameters.
+By default, is extracted from \code{object} or, if \code{object} contains no dispersion information, is set to \code{0.05}.}
\item{common.lib.size}{numeric scalar, the library size to normalize to; default is the geometric mean of the original effective library sizes}
}
@@ -26,16 +27,25 @@ equalizeLibSizes(object, dispersion=0, common.lib.size)
Thus function implements the quantile-quantile normalization method of Robinson and Smyth (2008).
It computes normalized counts, or pseudo-counts, used by \code{exactTest} and \code{estimateCommonDisp}.
-Note that the output common library size is a theoretical quantity.
-The column sums of the normalized counts, while to be exactly equal, nor are they intended to be.
-However, the expected counts for each tag are equal under the null hypothesis of no differential expression.
+The output pseudo-counts are the counts that would have theoretically arisen had the effective library sizes been equal for all samples.
+The pseudo-counts are computed in such as way as to preserve fold-change differences beween the groups defined by \code{object$samples$group} as well as biological variability within each group.
+Consequently, the results will depend on how the groups are defined.
+
+Note that the column sums of the \code{pseudo.counts} matrix will not generally be equal, because the effective library sizes are not necessarily the same as actual library sizes and because the normalized pseudo counts are not equal to expected counts.
+}
+
+\note{
+This function is intended mainly for internal edgeR use.
+It is not normally called directly by users.
}
\author{Mark Robinson, Davis McCarthy, Gordon Smyth}
\references{
-Robinson MD and Smyth GK (2008). Small-sample estimation of negative binomial dispersion, with applications to SAGE data.
+Robinson MD and Smyth GK (2008).
+Small-sample estimation of negative binomial dispersion, with applications to SAGE data.
\emph{Biostatistics}, 9, 321-332.
+\url{http://biostatistics.oxfordjournals.org/content/9/2/321}
}
\seealso{
diff --git a/man/estimateDisp.Rd b/man/estimateDisp.Rd
index 1f5ef0e..de54828 100644
--- a/man/estimateDisp.Rd
+++ b/man/estimateDisp.Rd
@@ -8,8 +8,8 @@ Maximizes the negative binomial likelihood to give the estimate of the common, t
}
\usage{
-estimateDisp(y, design=NULL, prior.df=NULL, trend.method="locfit", span=NULL,
- grid.length=21, grid.range=c(-10,10), robust=FALSE,
+estimateDisp(y, design=NULL, prior.df=NULL, trend.method="locfit", span=NULL,
+ min.row.sum=5, grid.length=21, grid.range=c(-10,10), robust=FALSE,
winsor.tail.p=c(0.05,0.1), tol=1e-06)
}
@@ -24,6 +24,8 @@ estimateDisp(y, design=NULL, prior.df=NULL, trend.method="locfit", span=NULL,
\item{span}{width of the smoothing window, as a proportion of the data set.}
+\item{min.row.sum}{numeric scalar giving a value for the filtering out of low abundance tags. Only tags with total sum of counts above this value are used. Low abundance tags can adversely affect the dispersion estimation, so this argument allows the user to select an appropriate filter threshold for the tag abundance.}
+
\item{grid.length}{the number of points on which the interpolation is applied for each tag.}
\item{grid.range}{the range of the grid points around the trend on a log2 scale.}
diff --git a/man/glmQLFTest.Rd b/man/glmQLFTest.Rd
new file mode 100644
index 0000000..3effb29
--- /dev/null
+++ b/man/glmQLFTest.Rd
@@ -0,0 +1,115 @@
+\name{glmQLFit}
+\alias{glmQLFit}
+\alias{glmQLFTest}
+
+\title{Quasi-likelihood methods with empirical Bayes shrinkage}
+
+\description{Fit a quasi-likelihood negative binomial generalized log-linear model to count data.
+Conduct genewise statistical tests for a given coefficient or coefficient contrast.}
+
+\usage{
+glmQLFit(y, design=NULL, dispersion=NULL, abundance.trend=TRUE, robust=FALSE, winsor.tail.p=c(0.05, 0.1), \dots)
+glmQLFTest(glmfit, coef=ncol(glmfit$design), contrast=NULL)
+}
+
+\arguments{
+\item{y}{a \code{DGEList} object containing count and sample data.}
+
+\item{design}{numeric matrix giving the design matrix for the tagwise linear models.}
+
+\item{dispersion}{numeric scalar or vector of negative binomial dispersions. Defaults to the trended dispersion, or the common dispersion (if no trend is available), or a value of 0.05 (if no common value is available).}
+
+\item{abundance.trend}{logical, whether to allow an abundance-dependent trend when estimating the prior values for the quasi-likelihood multiplicative dispersion parameter.}
+
+\item{robust}{logical, whether to estimate the prior degrees of freedom robustly.}
+
+\item{winsor.tail.p}{numeric vector of length 2 giving proportion to trim (Winsorize) from lower and upper tail of the distribution of genewise deviances when estimating the hyperparameters. Positive values produce robust empirical Bayes ignoring outlier small or large deviances. Only used when \code{robust=TRUE}.}
+
+\item{\dots}{other arguments are passed to \code{\link{glmFit}}.}
+
+\item{glmfit}{a \code{DGEGLM} object, usually output from \code{qlmQLFit}.}
+
+\item{coef}{integer or character vector indicating which coefficients of the linear model are to be tested equal to zero.}
+
+\item{contrast}{numeric vector or matrix specifying one or more contrasts of the linear model coefficients to be tested equal to zero.}
+}
+
+\value{
+\code{glmQLFit} produces an object of class \code{DGEGLM} with the same components as that produced by \code{\link{glmFit}}, plus:
+ \item{df.residual}{a numeric vector containing the number of residual degrees of freedom for the GLM fit of each gene.}
+ \item{s2.fit}{a list containing \code{df.prior}, the prior degrees of fredom; and \code{var.prior}, the location of the prior distribution. Both are numeric vectors if \code{abundance.trend=TRUE} and scalars otherwise. \code{var.post} is a numeric vector containing the shrunk quasi-likelihood dispersion for each gene.}
+ \item{df.prior}{a numeric vector or scalar containing the prior degrees of freedom, same as that in \code{s2.fit}.}
+
+\code{glmQFTest} produces objects of class \code{DGELRT} with the same components as for \code{glmfit} plus the following:
+ \item{table}{data frame with the same rows as \code{y} containing the log2-fold changes, F-statistics and p-values, ready to be displayed by \code{topTags.}.}
+ \item{comparison}{character string describing the coefficient or the contrast being tested.}
+
+The data frame \code{table} contains the following columns:
+ \item{logFC}{log2-fold change of expression between conditions being tested.}
+ \item{logCPM}{average log2-counts per million, the average taken over all libraries in \code{y}.}
+ \item{F}{F-statistics.}
+ \item{PValue}{p-values.}
+}
+
+\details{
+\code{glmQLFTest} implements the quasi-likelihood method of Lund et al (2012).
+It behaves the same as \code{glmLRT} except that it replaces likelihood ratio tests with quasi-likelihood F-tests for coefficients in the linear model.
+This function calls the limma function \code{\link{squeezeVar}} to conduct empirical Bayes smoothing of the genewise multiplicative dispersions.
+Note that the \code{QuasiSeq} package provides a alternative implementation of Lund et al (2012), with slightly different glm, trend and FDR methods.
+
+There are a number of subtleties involved in the use of QL models.
+The first is that the negative binomial dispersions \emph{must} be trended or common values.
+This is because the function assumes that the supplied values are the true values.
+For the trended/common values, the assumption is reasonable as information from many genes improves precision.
+This is not the case for the tagwise dispersions due to the limited information for each gene.
+
+Another subtlety involves the handling of zero counts.
+Observations with fitted values of zero provide no residual degrees of freedom.
+This must be considered when computing the value of the quasi-likelihood dispersion for genes with many zeros.
+Finally, a lower bound is defined for the p-value of each gene, based on the likelihood ratio test.
+This avoids spurious results involving weak shrinkage with very low quasi-likelihood dispersions.
+}
+
+\references{
+Lund, SP, Nettleton, D, McCarthy, DJ, and Smyth, GK (2012).
+Detecting differential expression in RNA-sequence data using quasi-likelihood with shrunken dispersion estimates.
+\emph{Statistical Applications in Genetics and Molecular Biology} Volume 11, Issue 5, Article 8.
+\url{http://www.statsci.org/smyth/pubs/QuasiSeqPreprint.pdf}
+}
+
+\author{Davis McCarthy and Gordon Smyth, with modifications by Aaron Lun}
+
+\examples{
+nlibs <- 4
+ntags <- 1000
+dispersion.true <- 1/rchisq(ntags, df=10)
+design <- model.matrix(~factor(c(1,1,2,2)))
+
+# Generate count data
+y <- rnbinom(ntags*nlibs,mu=20,size=1/dispersion.true)
+y <- matrix(y,ntags,nlibs)
+d <- DGEList(y)
+d <- calcNormFactors(d)
+
+# Fit the NB GLMs with QL methods
+d <- estimateDisp(d, design)
+fit <- glmQLFit(d, design)
+results <- glmQLFTest(fit)
+topTags(results)
+fit <- glmQLFit(d, design, robust=TRUE)
+results <- glmQLFTest(fit)
+topTags(results)
+fit <- glmQLFit(d, design, abundance.trend=FALSE)
+results <- glmQLFTest(fit)
+topTags(results)
+}
+
+\seealso{
+\code{\link{topTags}} displays results from \code{glmQLFTest}.
+
+\code{\link{plotQLDisp}} can be used to visualize the distribution of QL dispersions after EB shrinkage from \code{glmQLFit}.
+
+The \code{QuasiSeq} package gives an alternative implementation of \code{glmQLFTest} based on the same statistical ideas.
+}
+
+\keyword{models}
diff --git a/man/glmfit.Rd b/man/glmfit.Rd
index 4483ad3..48af6bf 100644
--- a/man/glmfit.Rd
+++ b/man/glmfit.Rd
@@ -3,9 +3,8 @@
\alias{glmFit.DGEList}
\alias{glmFit.default}
\alias{glmLRT}
-\alias{glmQLFTest}
-\title{Genewise Negative Binomial Generalized Linear Mdels}
+\title{Genewise Negative Binomial Generalized Linear Models}
\description{Fit a negative binomial generalized log-linear model to the read counts for each gene or transcript.
Conduct genewise statistical tests for a given coefficient or coefficient contrast.}
@@ -15,8 +14,6 @@ Conduct genewise statistical tests for a given coefficient or coefficient contra
\method{glmFit}{default}(y, design=NULL, dispersion=NULL, offset=NULL, lib.size=NULL, weights=NULL,
prior.count=0.125, start=NULL, \dots)
glmLRT(glmfit, coef=ncol(glmfit$design), contrast=NULL, test="chisq")
-glmQLFTest(y, design=NULL, dispersion=NULL, coef=ncol(glmfit$design), contrast=NULL,
- abundance.trend=TRUE, robust=FALSE, winsor.tail.p=c(0.05,0.1), plot=FALSE)
}
\arguments{
@@ -47,14 +44,6 @@ Defaults to a single column of ones, equivalent to treating the columns as repli
\item{contrast}{numeric vector or matrix specifying one or more contrasts of the linear model coefficients to be tested equal to zero. Number of rows must equal to the number of columns of \code{design}. If specified, then takes precedence over \code{coef}.}
\item{test}{which test (distribution) to use in calculating the p-values. Possible values are \code{"F"} or \code{"chisq"}.}
-
-\item{abundance.trend}{logical, whether to allow an abundance-dependent trend when estimating the prior values for the quasi-likelihood multiplicative dispersion parameter.}
-
-\item{robust}{logical, whether to estimate the prior.df robustly.}
-
-\item{winsor.tail.p}{numeric vector of length 2 giving proportion to trim (Winsorize) from lower and upper tail of the distribution of genewise deviances when estimating the hyperparameters. Positive values produce robust empirical Bayes ignoring outlier small or large deviances. Only used when \code{robust=TRUE}.}
-
-\item{plot}{logical, whether to plot the quasi-likelihood dispersion estimates vs abundance}
}
\value{
@@ -68,15 +57,14 @@ Defaults to a single column of ones, equivalent to treating the columns as repli
\item{fitted.values}{matrix of fitted values from glm fits, same number of rows and columns as \code{y}.}
\item{deviance}{numeric vector of deviances, one for each tag.}
-\code{glmLRT} and \code{glmQFTest} produce objects of class \code{DGELRT} with the same components as for \code{glmfit} plus the following:
- \item{table}{data frame with the same rows as \code{y} containing the log2-fold changes, test statistics and p-values, ready to be displayed by \code{topTags.}.}
+\code{glmLRT} produces objects of class \code{DGELRT} with the same components as for \code{glmfit} plus the following:
+ \item{table}{data frame with the same rows as \code{y} containing the log2-fold changes, likelhood ratio statistics and p-values, ready to be displayed by \code{topTags.}.}
\item{comparison}{character string describing the coefficient or the contrast being tested.}
The data frame \code{table} contains the following columns:
\item{logFC}{log2-fold change of expression between conditions being tested.}
\item{logCPM}{average log2-counts per million, the average taken over all libraries in \code{y}.}
- \item{LR}{likelihood ratio statistics (only for \code{glmLRT}).}
- \item{F}{F-statistics (only for \code{glmQFTest}).}
+ \item{LR}{likelihood ratio statistics.}
\item{PValue}{p-values.}
}
@@ -96,24 +84,12 @@ The returned coefficients are affected but not the likelihood ratio tests or p-v
If \code{coef} is used, the null hypothesis is that all the coefficients indicated by \code{coef} are equal to zero.
If \code{contrast} is non-null, then the null hypothesis is that the specified contrasts of the coefficients are equal to zero.
For example, a contrast of \code{c(0,1,-1)}, assuming there are three coefficients, would test the hypothesis that the second and third coefficients are equal.
-
-\code{glmQLFTest} implements the quasi-likelihood method of Lund et al (2012).
-It behaves the same as \code{glmLRT} except that it replaces likelihood ratio tests with
-quasi-likelihood F-tests for coefficients in the linear model.
-This function calls the limma function \code{\link{squeezeVar}} to conduct empirical Bayes smoothing of the genewise multiplicative dispersions.
-Note that the \code{QuasiSeq} package provides a alternative implementation of Lund et al (2012),
-with slightly different glm, trend and FDR methods.
}
\references{
McCarthy, DJ, Chen, Y, Smyth, GK (2012). Differential expression analysis of multifactor RNA-Seq experiments with respect to biological variation.
\emph{Nucleic Acids Research} 40, 4288-4297.
\url{http://nar.oxfordjournals.org/content/40/10/4288}
-
-Lund, SP, Nettleton, D, McCarthy, DJ, and Smyth, GK (2012).
-Detecting differential expression in RNA-sequence data using quasi-likelihood with shrunken dispersion estimates.
-\emph{Statistical Applications in Genetics and Molecular Biology} Volume 11, Issue 5, Article 8.
-\url{http://www.statsci.org/smyth/pubs/QuasiSeqPreprint.pdf}
}
\author{Davis McCarthy and Gordon Smyth}
@@ -153,9 +129,7 @@ d <- estimateGLMCommonDisp(d, design, verbose=TRUE)
\seealso{
Low-level computations are done by \code{\link{mglmOneGroup}} or \code{\link{mglmLevenberg}}.
-\code{\link{topTags}} displays results from \code{glmLRT} or \code{glmQLFTest}.
-
-The \code{QuasiSeq} package gives an alternative implementation of \code{glmQLFTest} based on the same statistical ideas.
+\code{\link{topTags}} displays results from \code{glmLRT}.
}
\keyword{models}
diff --git a/man/goana.Rd b/man/goana.Rd
new file mode 100644
index 0000000..8cb755f
--- /dev/null
+++ b/man/goana.Rd
@@ -0,0 +1,67 @@
+\name{goana.DGELRT}
+\alias{goana.DGEExact}
+\alias{goana.DGELRT}
+\title{Gene Ontology Analysis of Differentially Expressed Genes}
+\description{
+Test for over-representation of gene ontology (GO) terms in the up and down differentially expressed genes from a linear model fit.
+}
+\usage{
+\method{goana}{DGELRT}(de, geneid = rownames(de), FDR = 0.05, species = "Hs",
+ trend = FALSE, \dots)
+}
+\arguments{
+ \item{de}{an \code{DGELRT} object.}
+ \item{geneid}{Entrez Gene identifiers. Either a vector of length \code{nrow(de)} or the name of the column of \code{de$genes} containing the Entrez Gene IDs.}
+ \item{FDR}{false discovery rate cutoff for differentially expressed genes. Numeric value between 0 and 1.}
+ \item{species}{species identifier. Possible values are \code{"Hs"}, \code{"Mm"}, \code{"Rn"} or \code{"Dm"}.}
+ \item{trend}{adjust analysis for gene length or abundance?
+ Can be logical, or a numeric vector of covariate values, or the name of the column of \code{de$genes} containing the covariate values.
+ If \code{TRUE}, then \code{de$AveLogCPM} is used as the covariate.}
+ \item{\dots}{any other arguments are passed to \code{goana.default}.}
+}
+\details{
+Performs Gene Ontology enrichment analyses for the up and down differentially expressed genes from a linear model analysis.
+The Entrez Gene ID must be supplied for each gene.
+
+If \code{trend=FALSE}, the function computes one-sided hypergeometric tests equivalent to Fisher's exact test.
+
+If \code{trend=TRUE} or a covariate is supplied, then a trend is fitted to the differential expression results and the method of Young et al (2010) is used to adjust for this trend.
+The adjusted test uses Wallenius' noncentral hypergeometric distribution.
+}
+\value{
+A data frame with a row for each GO term and the following columns:
+ \item{Term}{GO term.}
+ \item{Ont}{ontology that the GO term belongs to. Possible values are \code{"BP"}, \code{"CC"} and \code{"MF"}.}
+ \item{N}{Number of genes in the GO term.}
+ \item{Up}{number of up-regulated differentially expressed genes.}
+ \item{Down}{number of down-regulated differentially expressed genes.}
+ \item{P.Up}{p-value for over-representation of GO term in up-regulated genes.}
+ \item{P.Down}{p-value for over-representation of GO term in down-regulated genes.}
+The row names of the data frame give the GO term IDs.
+}
+
+\references{
+ Young, M. D., Wakefield, M. J., Smyth, G. K., Oshlack, A. (2010).
+ Gene ontology analysis for RNA-seq: accounting for selection bias.
+ \emph{Genome Biology} 11, R14.
+ \url{http://genomebiology.com/2010/11/2/R14}
+}
+
+\seealso{
+\code{\link{goana}}, \code{\link{topGO}}
+}
+
+\author{Gordon Smyth and Yifang Hu}
+
+\examples{
+\dontrun{
+
+fit <- glmFit(y, design)
+lrt <- glmLRT(fit)
+go <- goana(lrt)
+topGO(go, ont="BP", sort = "up")
+topGO(go, ont="BP", sort = "down")
+}
+}
+
+\keyword{gene set test}
diff --git a/man/mglm.Rd b/man/mglm.Rd
index 512058e..883a9e4 100644
--- a/man/mglm.Rd
+++ b/man/mglm.Rd
@@ -12,8 +12,10 @@ Fit the same log-link negative binomial or Poisson generalized linear model (GLM
}
\usage{
-mglmOneGroup(y, dispersion=0, offset=0, weights=NULL, maxit=50, tol=1e-10, verbose=FALSE)
-mglmOneWay(y, design=NULL, dispersion=0, offset=0, weights=NULL, maxit=50, tol=1e-10)
+mglmOneGroup(y, dispersion=0, offset=0, weights=NULL, maxit=50, tol=1e-10,
+ verbose=FALSE, coef.start=NULL)
+mglmOneWay(y, design=NULL, dispersion=0, offset=0, weights=NULL, maxit=50,
+ tol=1e-10, coef.start=NULL)
mglmLevenberg(y, design, dispersion=0, offset=0, weights=NULL,
coef.start=NULL, start.method="null", maxit=200, tol=1e-06)
designAsFactor(design)
diff --git a/man/plotQLDisp.Rd b/man/plotQLDisp.Rd
new file mode 100644
index 0000000..68d2b4f
--- /dev/null
+++ b/man/plotQLDisp.Rd
@@ -0,0 +1,48 @@
+\title{Plot the quasi-likelihood dispersion}
+\name{plotQLDisp}
+\alias{plotQLDisp}
+\description{
+Plot the genewise quasi-likelihood dispersion against the gene abundance (in log2 counts per million).
+}
+\usage{
+plotQLDisp(glmfit, xlab="Average Log2 CPM", ylab="Quarter-Root Mean Deviance", pch=16, cex=0.2,
+ col.shrunk="red", col.trend="blue", col.raw="black", \dots)
+}
+\arguments{
+ \item{glmfit}{a \code{DGEGLM} object produced by \code{\link{glmQLFit}}.}
+ \item{xlab}{label for the x-axis.}
+ \item{ylab}{label for the y-axis.}
+ \item{pch}{the plotting symbol. See \code{\link{points}} for more details.}
+ \item{cex}{plot symbol expansion factor. See \code{\link{points}} for more details.}
+ \item{col.shrunk}{color of the points representing the shrunk quasi-liklihood dispersions.}
+ \item{col.trend}{color of line showing dispersion trend.}
+ \item{col.raw}{color of points showing the unshrunk dispersions.}
+ \item{\dots}{any other arguments are passed to \code{plot}.}
+}
+
+\details{
+This function displays the quarter-root of the quasi-likelihood dispersions for all genes, before and after shrinkage towards a trend.
+If \code{glmfit} was constructed without an abundance trend, the function instead plots a horizontal line (of colour \code{col.trend}) at the common value towards which dispersions are shrunk.
+The quarter-root transformation is applied to improve visibility for dispersions around unity.
+}
+
+\value{
+A plot is created on the current graphics device.
+}
+
+\author{Aaron Lun, based on code by Davis McCarthy and Gordon Smyth}
+
+\examples{
+nbdisp <- 1/rchisq(1000, df=10)
+y <- DGEList(matrix(rnbinom(6000, size = 1/nbdisp, mu = 10),1000,6))
+design <- model.matrix(~factor(c(1,1,1,2,2,2)))
+y <- estimateDisp(y, design)
+
+fit <- glmQLFit(y, design)
+plotQLDisp(fit)
+
+fit <- glmQLFit(y, design, abundance.trend=FALSE)
+plotQLDisp(fit)
+}
+
+\keyword{plot}
diff --git a/man/plotSpliceDGE.Rd b/man/plotSpliceDGE.Rd
new file mode 100644
index 0000000..4ef358f
--- /dev/null
+++ b/man/plotSpliceDGE.Rd
@@ -0,0 +1,26 @@
+\title{Plot exons on differentially spliced gene}
+\name{plotSpliceDGE}
+\alias{plotSpliceDGE}
+\description{
+Plot exons of differentially spliced gene.
+}
+\usage{
+plotSpliceDGE(lrt, geneid=NULL, rank=1L, FDR = 0.05)
+}
+\arguments{
+ \item{lrt}{\code{GLMLRT} object produced by \code{diffSpliceDGE}.}
+ \item{geneid}{character string, ID of the gene to plot.}
+ \item{rank}{integer, if \code{geneid=NULL} then this ranked gene will be plotted.}
+ \item{FDR}{numeric, mark exons with false discovery rate less than this cutoff.}
+}
+
+\details{
+Plots interaction log-fold-change by exon for the specified gene.
+}
+
+\value{A plot is created on the current graphics device.}
+\author{Yunshun Chen, Yifang Hu and Gordon Smyth}
+\seealso{
+\code{\link{diffSpliceDGE}}
+}
+\examples{# See \code{\link{diffSpliceDGE}}}
\ No newline at end of file
diff --git a/man/processAmplicons.Rd b/man/processAmplicons.Rd
new file mode 100644
index 0000000..a7efce8
--- /dev/null
+++ b/man/processAmplicons.Rd
@@ -0,0 +1,61 @@
+\name{processAmplicons}
+\alias{processAmplicons}
+
+\title{Process raw data from pooled genetic sequencing screens}
+
+\description{
+Given a list of sample-specific index (barcode) sequences and hairpin/sgRNA-specific sequences from an amplicon sequencing screen, generate a DGEList of counts from the raw fastq file/(s) containing the sequence reads.
+}
+
+\usage{
+processAmplicons(readfile, readfile2=NULL, barcodefile, hairpinfile,
+ barcodeStart=1, barcodeEnd=5,
+ barcodeStartRev=NULL, barcodeEndRev=NULL,
+ hairpinStart=37, hairpinEnd=57,
+ allowShifting=FALSE, shiftingBase=3,
+ allowMismatch=FALSE, barcodeMismatchBase=1,
+ hairpinMismatchBase=2, allowShiftedMismatch=FALSE,
+ verbose=FALSE)
+}
+
+\arguments{
+\item{readfile}{character vector giving one or more fastq filenames}
+\item{readfile2}{character vector giving one or more fastq filenames for reverse read, default to NULL}
+\item{barcodefile}{filename containing sample-specific barcode ids and sequences}
+\item{hairpinfile}{filename containing hairpin/sgRNA-specific ids and sequences}
+\item{barcodeStart}{numeric value, starting position (inclusive) of barcode sequence in reads}
+\item{barcodeEnd}{numeric value, ending position (inclusive) of barcode sequence in reads}
+\item{barcodeStartRev}{numeric value, starting position (inclusive) of barcode sequence in reverse reads, default to NULL}
+\item{barcodeEndRev}{numeric value, ending position (inclusive) of barcode sequence in reverse reads, default to NULL}
+\item{hairpinStart}{numeric value, starting position (inclusive) of hairpin/sgRNA sequence in reads}
+\item{hairpinEnd}{numeric value, ending position (inclusive) of hairpin/sgRNA sequence in reads}
+\item{allowShifting}{logical, indicates whether a given hairpin/sgRNA can be matched to a neighbouring position}
+\item{shiftingBase}{numeric value of maximum number of shifted bases from input \code{hairpinStart} and \code{hairpinEnd} should the program check for a hairpin/sgRNA match when \code{allowShifting} is \code{TRUE}}
+\item{allowMismatch}{logical, indicates whether sequence mismatch is allowed}
+\item{barcodeMismatchBase}{numeric value of maximum number of base sequence mismatches allowed in a barcode sequence when \code{allowShifting} is \code{TRUE}}
+\item{hairpinMismatchBase}{numeric value of maximum number of base sequence mismatches allowed in a hairpin/sgRNA sequence when \code{allowShifting} is \code{TRUE}}
+\item{allowShiftedMismatch}{logical, effective when \code{allowShifting} and \code{allowMismatch} are both \code{TRUE}. It indicates whether we check for sequence mismatches at a shifted position.}
+\item{verbose}{if \code{TRUE}, output program progress}
+}
+
+\value{Returns a \code{\link[edgeR:DGEList-class]{DGEList}} object with following components:
+ \item{counts}{read count matrix tallying up the number of reads with particular barcode and hairpin/sgRNA matches. Each row is a hairpin and each column is a sample}
+ \item{genes}{In this case, hairpin/sgRNA-specific information (ID, sequences, corresponding target gene) may be recorded in this data.frame}
+ \item{lib.size}{auto-calculated column sum of the counts matrix}
+}
+
+\details{
+The input barcode file and hairpin/sgRNA files are tab-separated text files with at least two columns (named 'ID' and 'Sequences') containing the sample or hairpin/sgRNA ids and a second column indicating the sample index or hairpin/sgRNA sequences to be matched. If \code{readfile2}, \code{barcodeStartRev} and \code{barcodeEndRev} are specified, a third column 'SequencesReverse' is expected in the barcode file. The barcode file may also contain a 'group' column that indicates which exper [...]
+
+To compute the count matrix, matching to the given barcodes and hairpins/sgRNAs is conducted in two rounds. The first round looks for an exact sequence match for the given barcode sequences and hairpin/sgRNA sequences at the locations specified. If \code{allowShifting} is set to \code{TRUE}, the program also checks if a given hairpin/sgRNA sequence can be found at a neighbouring position in the read. For hairpins/sgRNAs without a match, the program performs a second round of matching whi [...]
+
+The program outputs a \code{\link[edgeR:DGEList-class]{DGEList}} object, with a count matrix indicating the number of times each barcode and hairpin/sgRNA combination could be matched in reads from input fastq file/(s).
+
+For further examples and data, refer to the Case studies available from http://bioinf.wehi.edu.au/shRNAseq/.
+}
+
+\author{Zhiyin Dai and Matthew Ritchie}
+
+\references{
+Dai Z, Sheridan JM, et al. (2014). shRNA-seq data analysis with edgeR. F1000Research, http://f1000research.com/articles/10.12688/f1000research.4204/doi.
+}
diff --git a/man/processHairpinReads.Rd b/man/processHairpinReads.Rd
deleted file mode 100644
index c41c27c..0000000
--- a/man/processHairpinReads.Rd
+++ /dev/null
@@ -1,55 +0,0 @@
-\name{processHairpinReads}
-\alias{processHairpinReads}
-
-\title{Process raw data from shRNA-seq screens}
-
-\description{
-Given a list of barcode sequences and hairpin sequences from a shRNA-seq screen, generate a DGEList of counts from the raw fastq file/(s) containing the sequence reads.
-}
-
-\usage{
-processHairpinReads(readfile, barcodefile, hairpinfile,
- barcodeStart=1, barcodeEnd=5, hairpinStart=37, hairpinEnd=57,
- allowShifting=FALSE, shiftingBase = 3,
- allowMismatch=FALSE, barcodeMismatchBase = 1, hairpinMismatchBase = 2,
- allowShiftedMismatch=FALSE, verbose = FALSE)
-}
-
-\arguments{
-\item{readfile}{character vector giving one or more fastq filenames}
-\item{barcodefile}{filename containing barcode ids and sequences}
-\item{hairpinfile}{filename containing hairpin ids and sequences}
-\item{barcodeStart}{numeric value, starting position (inclusive) of barcode sequence in reads}
-\item{barcodeEnd}{numeric value, ending position (inclusive) of barcode sequence in reads}
-\item{hairpinStart}{numeric value, starting position (inclusive) of hairpin sequence in reads}
-\item{hairpinEnd}{numeric value, ending position (inclusive) of hairpin sequence in reads}
-\item{allowShifting}{logical, indicates whether a given hairpin can be matched to a neighbouring position}
-\item{shiftingBase}{numeric value of maximum number of shifted bases from input \code{hairpinStart} and \code{hairpinEnd} should the program check for a hairpin match when \code{allowShifting} is \code{TRUE}}
-\item{allowMismatch}{logical, indicates whether sequence mismatch is allowed}
-\item{barcodeMismatchBase}{numeric value of maximum number of base sequence mismatch allowed in barcode when \code{allowShifting} is \code{TRUE}}
-\item{hairpinMismatchBase}{numeric value of maximum number of base sequence mismatch allowed in hairpin when \code{allowShifting} is \code{TRUE}}
-\item{allowShiftedMismatch}{logical, effective when \code{allowShifting} and \code{allowMismatch} are both \code{TRUE}. It indicates whether we check for sequence mismatch at a shifted position.}
-\item{verbose}{if \code{TRUE}, output program progess}
-}
-
-\value{Returns a \code{\link[edgeR:DGEList-class]{DGEList}} object with following components:
- \item{counts}{read count matrix tallying up the number of reads with particular barcode and hairpin matches. Each row is a hairpin and each column is a sample}
- \item{genes}{In this case, hairpin information (ID, sequences, corresponding target gene) may be recorded in this data.frame}
- \item{lib.size}{auto-calculated column sum of the counts matrix}
-}
-
-\details{
-The input barcode file and hairpin files are tab-separated text files with at least two columns (named 'ID' and 'Sequences') containing the sample or hairpin ids and a second column indicating the sample index or hairpin sequences to be matched. The barcode file may also contain a 'group' column that indicates which experimental group a sample belongs to. Additional columns in each file will be included in the respective \code{$samples} or \code{$genes} data.frames of the final code{\l [...]
-
-To compute the count matrix, the matching to given barcodes and hairpins is conducted in two rounds. The first round looks for an exact sequence match. The program checks for a match from given barcode sequences and hairpin sequences at specified location. If \code{allowShifting} is set to \code{TRUE}, the program also checks if a given hairpin sequence can be found at a neighbouring position in read. For hairpins without a match, the program performs a second round of mapping which allo [...]
-
-The program outputs a \code{\link[edgeR:DGEList-class]{DGEList}} object, with a count matrix indicating the number of times each barcode and hairpin combination could be matched in reads from input fastq file/(s).
-
-For further examples and data, refer to the Case studies available from http://bioinf.wehi.edu.au/shRNAseq/.
-}
-
-\author{Zhiyin Dai, Matthew Ritchie}
-
-\references{
-Dai Z, Sheridan JM, et al. (submitted, 2014). shRNA-seq data analysis with edgeR. \emph{submitted}.
-}
diff --git a/man/topSpliceDGE.Rd b/man/topSpliceDGE.Rd
new file mode 100644
index 0000000..ef264a8
--- /dev/null
+++ b/man/topSpliceDGE.Rd
@@ -0,0 +1,33 @@
+\title{Top table of differentially spliced genes or exons}
+\name{topSpliceDGE}
+\alias{topSpliceDGE}
+\description{
+Top table ranking the most differentially spliced genes or exons.
+}
+\usage{
+topSpliceDGE(lrt, level="gene", gene.test="Simes", number=10, FDR=1)
+}
+\arguments{
+ \item{lrt}{\code{DGELRT} object produced by \code{diffSpliceDGE}.}
+ \item{level}{character string, should the table be by \code{"exon"} or by \code{"gene"}.}
+ \item{gene.test}{character string, choice for the gene-level p-values. Possible values are "Simes" and "F".}
+ \item{number}{integer, maximum number of rows to output.}
+ \item{FDR}{numeric, only show exons or genes with false discovery rate less than this cutoff.}
+}
+
+\details{
+Ranks exons or genes by p-values.
+}
+
+\value{A data.frame with any annotation columns found in \code{fit} plus the following columns
+ \item{NExons}{number of exons if \code{level="gene"}}
+ \item{Gene.Exon}{exon annotation if \code{level="exon"}}
+ \item{logFC}{log-fold change of one exon vs all the exons for the same gene (if \code{level="exon"})}
+ \item{F}{F-statistics for exons if \code{level="exon"}}
+ \item{P.Value}{p-value}
+ \item{FDR}{false discovery rate}
+}
+
+\author{Yunshun Chen and Gordon Smyth}
+
+\examples{# See \code{\link{diffSpliceDGE}}}
diff --git a/man/treatDGE.Rd b/man/treatDGE.Rd
new file mode 100644
index 0000000..53dbf95
--- /dev/null
+++ b/man/treatDGE.Rd
@@ -0,0 +1,62 @@
+\name{treatDGE}
+\alias{treatDGE}
+
+\title{Testing for Differential Expression Relative to a Threshold}
+
+\description{Conduct genewise statistical tests for a given coefficient or coefficient contrast relative to a specified threshold.}
+
+\usage{
+treatDGE(glmfit, coef=ncol(glmfit$design), contrast=NULL, lfc=0)
+}
+
+\arguments{
+\item{glmfit}{a \code{DGEGLM} object, usually output from \code{glmFit}.}
+
+\item{coef}{integer or character vector indicating which coefficients of the linear model are to be tested equal to zero. Values must be columns or column names of \code{design}. Defaults to the last coefficient. Ignored if \code{contrast} is specified.}
+
+\item{contrast}{numeric vector specifying the contrast of the linear model coefficients to be tested against the log2-fold change threshold. Length must equal to the number of columns of \code{design}. If specified, then takes precedence over \code{coef}.}
+
+\item{lfc}{numeric scalar specifying the absolute value of the log2-fold change threshold above which differential expression is to be considered.}
+}
+
+\value{
+\code{treatDGE} produces an object of class \code{DGELRT} with the same components as for \code{glmfit} plus the following:
+ \item{lfc}{absolute value of the specified log2-fold change threshold.}
+ \item{table}{data frame with the same rows as \code{glmfit} containing the log2-fold changes, average log2-counts per million and p-values, ready to be displayed by \code{topTags.}.}
+ \item{comparison}{character string describing the coefficient or the contrast being tested.}
+
+The data frame \code{table} contains the following columns:
+ \item{logFC}{log2-fold change of expression between conditions being tested.}
+ \item{logCPM}{average log2-counts per million, the average taken over all libraries.}
+ \item{PValue}{p-values.}
+}
+
+\details{
+\code{treatDGE} implements a two-sided modified likelihood ratio test.
+}
+
+\author{Yunshun Chen and Gordon Smyth}
+
+\examples{
+ngenes <- 100
+n1 <- 3
+n2 <- 3
+nlibs <- n1+n2
+mu <- 100
+phi <- 0.1
+group <- c(rep(1,n1), rep(2,n2))
+design <- model.matrix(~as.factor(group))
+
+### 4-fold change for the first 5 genes
+i <- 1:5
+fc <- 4
+mu <- matrix(mu, ngenes, nlibs)
+mu[i, 1:n1] <- mu[i, 1:n1]*fc
+
+counts <- matrix(rnbinom(ngenes*nlibs, mu=mu, size=1/phi), ngenes, nlibs)
+d <- DGEList(counts=counts,lib.size=rep(1e6, nlibs), group=group)
+
+gfit <- glmFit(d, design, dispersion=phi)
+tr <- treatDGE(gfit, coef=2, lfc=1)
+topTags(tr)
+}
diff --git a/src/Makevars b/src/Makevars
index 5569601..004bc02 100644
--- a/src/Makevars
+++ b/src/Makevars
@@ -1,4 +1 @@
-CHECK=#-Wall -pedantic
PKG_LIBS=$(LAPACK_LIBS) $(BLAS_LIBS) $(FLIBS)
-PKG_CPPFLAGS=$(CHECK)
-PKG_CFLAGS=$(CHECK)
diff --git a/src/R_compute_nbdev.cpp b/src/R_compute_nbdev.cpp
index 89e26ed..dd6551f 100644
--- a/src/R_compute_nbdev.cpp
+++ b/src/R_compute_nbdev.cpp
@@ -1,26 +1,23 @@
#include "utils.h"
#include "glm.h"
-#include <iostream>
-
-extern "C" {
SEXP R_compute_nbdev (SEXP y, SEXP mu, SEXP phi) try {
- if (!IS_NUMERIC(phi)) { throw std::runtime_error("dispersion vector should be double-precision"); }
+ if (!isNumeric(phi)) { throw std::runtime_error("dispersion vector should be double-precision"); }
const int ntags=LENGTH(phi);
- if (!IS_NUMERIC(y)) { throw std::runtime_error("count matrix should be double-precision"); }
- if (!IS_NUMERIC(mu)) { throw std::runtime_error("matrix of means should be double-precision"); }
+ if (!isNumeric(y)) { throw std::runtime_error("count matrix should be double-precision"); }
+ if (!isNumeric(mu)) { throw std::runtime_error("matrix of means should be double-precision"); }
const int nlib=LENGTH(mu)/ntags;
if (nlib*ntags !=LENGTH(mu)) { throw std::runtime_error("mean matrix has inconsistent dimensions"); }
if (LENGTH(mu)!=LENGTH(y)) { throw std::runtime_error("count and mean matrices should have same dimensions"); }
- const double* yptr=NUMERIC_POINTER(y);
- const double* mptr=NUMERIC_POINTER(mu);
- const double* dptr=NUMERIC_POINTER(phi);
+ const double* yptr=REAL(y);
+ const double* mptr=REAL(mu);
+ const double* dptr=REAL(phi);
// Running through each row and computing the unit deviance, and then that sum.
SEXP output=PROTECT(allocMatrix(REALSXP, ntags, nlib));
try {
- double* optr=NUMERIC_POINTER(output);
+ double* optr=REAL(output);
int counter;
for (int i=0; i<ntags; ++i) {
counter=0;
@@ -41,5 +38,3 @@ SEXP R_compute_nbdev (SEXP y, SEXP mu, SEXP phi) try {
} catch(std::exception& e) {
return mkString(e.what());
}
-
-}
diff --git a/src/R_cr_adjust.cpp b/src/R_cr_adjust.cpp
index aca987c..4ed4a81 100644
--- a/src/R_cr_adjust.cpp
+++ b/src/R_cr_adjust.cpp
@@ -1,9 +1,6 @@
#include "glm.h"
-extern "C" {
-
-/*
- * 'w' represents a matrix of negative binomial probabilities (i.e.
+/* 'w' represents a matrix of negative binomial probabilities (i.e.
* the prob of success/failure, a function of mean and dispersion)
* whereas 'x' represents the design matrix. This function calculates
* the multiplication of the matrices, then performs a Cholesky decomposition
@@ -11,19 +8,19 @@ extern "C" {
* be used to compute the Cox-Reid adjustment factor.
*/
SEXP R_cr_adjust (SEXP w, SEXP x, SEXP nlibs) try {
- if (!IS_NUMERIC(w)) { throw std::runtime_error("matrix of likelihoods must be double precision"); }
- if (!IS_NUMERIC(x)) { throw std::runtime_error("design matrix must be double precision"); }
+ if (!isNumeric(w)) { throw std::runtime_error("matrix of likelihoods must be double precision"); }
+ if (!isNumeric(x)) { throw std::runtime_error("design matrix must be double precision"); }
- const int num_libs=INTEGER_VALUE(nlibs);
+ const int num_libs=asInteger(nlibs);
const int num_tags=LENGTH(w)/num_libs;
const int num_coefs=LENGTH(x)/num_libs;
// Setting up a couple of indices for quick access.
- adj_coxreid acr(num_libs, num_coefs, NUMERIC_POINTER(x));
- double* wptr=NUMERIC_POINTER(w);
+ adj_coxreid acr(num_libs, num_coefs, REAL(x));
+ const double* wptr=REAL(w);
- SEXP output=PROTECT(NEW_NUMERIC(num_tags));
- double* out_ptr=NUMERIC_POINTER(output);
+ SEXP output=PROTECT(allocVector(REALSXP, num_tags));
+ double* out_ptr=REAL(output);
try {
// Running through each tag.
for (int tag=0; tag<num_tags; ++tag) {
@@ -44,5 +41,3 @@ SEXP R_cr_adjust (SEXP w, SEXP x, SEXP nlibs) try {
UNPROTECT(1);
return output;
} catch (std::exception& e) { return mkString(e.what()); }
-
-}
diff --git a/src/R_exact_test_by_deviance.cpp b/src/R_exact_test_by_deviance.cpp
index 5743e05..57ba811 100644
--- a/src/R_exact_test_by_deviance.cpp
+++ b/src/R_exact_test_by_deviance.cpp
@@ -1,21 +1,14 @@
#include "utils.h"
#include "glm.h"
-extern "C" {
#include "Rmath.h"
-}
-#ifdef DEBUG
-#include <iostream>
-#endif
-
-extern "C" {
SEXP R_exact_test_by_deviance(SEXP sums_1, SEXP sums_2, SEXP n_1, SEXP n_2, SEXP disp) try {
- if (!IS_INTEGER(n_1) || LENGTH(n_1)!=1 || !IS_INTEGER(n_2) || LENGTH(n_2)!=1) {
+ if (!isInteger(n_1) || LENGTH(n_1)!=1 || !isInteger(n_2) || LENGTH(n_2)!=1) {
throw std::runtime_error("number of libraries must be integer scalars"); }
- if (!IS_NUMERIC(disp)) { throw std::runtime_error("dispersion must be a double precision vector"); }
- if (!IS_INTEGER(sums_1) || !IS_INTEGER(sums_2)) { throw std::runtime_error("sums must be integer vectors"); }
+ if (!isNumeric(disp)) { throw std::runtime_error("dispersion must be a double precision vector"); }
+ if (!isInteger(sums_1) || !isInteger(sums_2)) { throw std::runtime_error("sums must be integer vectors"); }
- const int n1=INTEGER_VALUE(n_1), n2=INTEGER_VALUE(n_2);
+ const int n1=asInteger(n_1), n2=asInteger(n_2);
const int nlibs = n1+n2;
const int ntags=LENGTH(sums_1);
if (ntags!=LENGTH(sums_2) || ntags!=LENGTH(disp)) {
@@ -23,12 +16,12 @@ SEXP R_exact_test_by_deviance(SEXP sums_1, SEXP sums_2, SEXP n_1, SEXP n_2, SEXP
} else if (n1<=0 || n2 <=0) {
throw std::runtime_error("number of libraries must be positive for each condition");
}
- const int* s1_ptr=INTEGER_POINTER(sums_1), *s2_ptr=INTEGER_POINTER(sums_2);
- const double *d_ptr=NUMERIC_POINTER(disp);
+ const int* s1_ptr=INTEGER(sums_1), *s2_ptr=INTEGER(sums_2);
+ const double *d_ptr=REAL(disp);
- SEXP output=PROTECT(NEW_NUMERIC(ntags));
+ SEXP output=PROTECT(allocVector(REALSXP, ntags));
try{
- double* p_ptr=NUMERIC_POINTER(output);
+ double* p_ptr=REAL(output);
for (int i=0; i<ntags; ++i) {
const int& s1=s1_ptr[i];
const int& s2=s2_ptr[i];
@@ -74,5 +67,3 @@ SEXP R_exact_test_by_deviance(SEXP sums_1, SEXP sums_2, SEXP n_1, SEXP n_2, SEXP
UNPROTECT(1);
return output;
} catch (std::exception& e) { return mkString(e.what()); }
-
-}
diff --git a/src/R_levenberg.cpp b/src/R_levenberg.cpp
index 9185a45..7149c89 100644
--- a/src/R_levenberg.cpp
+++ b/src/R_levenberg.cpp
@@ -1,27 +1,25 @@
#include "glm.h"
#include "matvec_check.h"
-extern "C" {
-
SEXP R_levenberg (SEXP nlib, SEXP ntag, SEXP design, SEXP counts, SEXP disp, SEXP offset, SEXP weights,
SEXP beta, SEXP fitted, SEXP tol, SEXP maxit) try {
- if (!IS_NUMERIC(design)) { throw std::runtime_error("design matrix should be double precision"); }
- if (!IS_NUMERIC(disp)) { throw std::runtime_error("dispersion vector should be double precision"); }
- if (!IS_NUMERIC(beta)) { throw std::runtime_error("matrix of start values for coefficients should be double precision"); }
- if (!IS_NUMERIC(fitted)) { throw std::runtime_error("matrix of starting fitted values should be double precision"); }
- const int num_tags=INTEGER_VALUE(ntag);
- const int num_libs=INTEGER_VALUE(nlib);
+ if (!isNumeric(design)) { throw std::runtime_error("design matrix should be double precision"); }
+ if (!isNumeric(disp)) { throw std::runtime_error("dispersion vector should be double precision"); }
+ if (!isNumeric(beta)) { throw std::runtime_error("matrix of start values for coefficients should be double precision"); }
+ if (!isNumeric(fitted)) { throw std::runtime_error("matrix of starting fitted values should be double precision"); }
+ const int num_tags=asInteger(ntag);
+ const int num_libs=asInteger(nlib);
// Checking the count matrix.
const double *cdptr=NULL;
const int* ciptr=NULL;
double* count_ptr=(double*)R_alloc(num_libs, sizeof(double));
- bool is_integer=IS_INTEGER(counts);
+ bool is_integer=isInteger(counts);
if (is_integer) {
- ciptr=INTEGER_POINTER(counts);
+ ciptr=INTEGER(counts);
} else {
- if (!IS_NUMERIC(counts)) { throw std::runtime_error("count matrix must be integer or double-precision"); }
- cdptr=NUMERIC_POINTER(counts);
+ if (!isNumeric(counts)) { throw std::runtime_error("count matrix must be integer or double-precision"); }
+ cdptr=REAL(counts);
}
// Getting and checking the dimensions of the arguments.
@@ -40,30 +38,30 @@ SEXP R_levenberg (SEXP nlib, SEXP ntag, SEXP design, SEXP counts, SEXP disp, SEX
}
// Initializing pointers to the assorted features.
- const double* beta_ptr=NUMERIC_POINTER(beta),
- *design_ptr=NUMERIC_POINTER(design),
- *fitted_ptr=NUMERIC_POINTER(fitted),
- *disp_ptr=NUMERIC_POINTER(disp);
+ const double* beta_ptr=REAL(beta),
+ *design_ptr=REAL(design),
+ *fitted_ptr=REAL(fitted),
+ *disp_ptr=REAL(disp);
matvec_check allo(num_libs, num_tags, offset, true, "offset", false);
const double* const* optr2=allo.access();
matvec_check allw(num_libs, num_tags, weights, true, "weight", true);
const double* const* wptr2=allw.access();
// Initializing output cages.
- SEXP output=PROTECT(NEW_LIST(5));
+ SEXP output=PROTECT(allocVector(VECSXP, 5));
SET_VECTOR_ELT(output, 0, allocMatrix(REALSXP, num_coefs, num_tags)); // beta (transposed)
SET_VECTOR_ELT(output, 1, allocMatrix(REALSXP, num_libs, num_tags)); // new fitted (transposed)
- SET_VECTOR_ELT(output, 2, NEW_NUMERIC(num_tags));
- SET_VECTOR_ELT(output, 3, NEW_INTEGER(num_tags));
- SET_VECTOR_ELT(output, 4, NEW_LOGICAL(num_tags));
- double* new_beta_ptr=NUMERIC_POINTER(VECTOR_ELT(output, 0));
- double* new_fitted_ptr=NUMERIC_POINTER(VECTOR_ELT(output, 1));
- double* dev_ptr=NUMERIC_POINTER(VECTOR_ELT(output, 2));
- int* iter_ptr=INTEGER_POINTER(VECTOR_ELT(output, 3));
- int* fail_ptr=LOGICAL_POINTER(VECTOR_ELT(output, 4));
+ SET_VECTOR_ELT(output, 2, allocVector(REALSXP, num_tags));
+ SET_VECTOR_ELT(output, 3, allocVector(INTSXP, num_tags));
+ SET_VECTOR_ELT(output, 4, allocVector(LGLSXP, num_tags));
+ double* new_beta_ptr=REAL(VECTOR_ELT(output, 0));
+ double* new_fitted_ptr=REAL(VECTOR_ELT(output, 1));
+ double* dev_ptr=REAL(VECTOR_ELT(output, 2));
+ int* iter_ptr=INTEGER(VECTOR_ELT(output, 3));
+ int* fail_ptr=LOGICAL(VECTOR_ELT(output, 4));
try {
// Running through each tag and fitting the NB GLM.
- glm_levenberg glbg(num_libs, num_coefs, design_ptr, INTEGER_VALUE(maxit), NUMERIC_VALUE(tol));
+ glm_levenberg glbg(num_libs, num_coefs, design_ptr, asInteger(maxit), asReal(tol));
for (int tag=0; tag<num_tags; ++tag) {
// Copying integer/double counts to a new vector.
@@ -108,6 +106,3 @@ SEXP R_levenberg (SEXP nlib, SEXP ntag, SEXP design, SEXP counts, SEXP disp, SEX
UNPROTECT(1);
return output;
} catch (std::exception& e) { return mkString(e.what()); }
-
-}
-
diff --git a/src/R_loess_by_col.cpp b/src/R_loess_by_col.cpp
index 9bab040..c442dee 100644
--- a/src/R_loess_by_col.cpp
+++ b/src/R_loess_by_col.cpp
@@ -1,7 +1,5 @@
#include "utils.h"
-extern "C" {
-
/* 'y' is a m by n matrix where each column corresponds to a different
* dataset and each row corresponds to a point in 'x'. 'x' is a 'm'-long
* vector containing sorted x-coordinates. 's' describes the span.
@@ -9,32 +7,32 @@ extern "C" {
SEXP R_loess_by_col(SEXP x, SEXP y, SEXP n_cols, SEXP s) try {
// Setting up input data structures.
- if (!IS_NUMERIC(x)) { throw std::runtime_error("vector of covariates must be double precision"); }
- if (!IS_NUMERIC(y)) { throw std::runtime_error("vector of reponses must be double precision"); }
+ if (!isNumeric(x)) { throw std::runtime_error("vector of covariates must be double precision"); }
+ if (!isNumeric(y)) { throw std::runtime_error("vector of reponses must be double precision"); }
const int total=LENGTH(x);
- const int span=INTEGER_VALUE(s);
+ const int span=asInteger(s);
if (span>total) {
throw std::runtime_error("number of smoothing points should less than the total number of points");
} else if (span<=0) {
throw std::runtime_error("number of smoothing points should be positive");
}
- const double* x_ptr=NUMERIC_POINTER(x);
- const int ncols=INTEGER_VALUE(n_cols);
+ const double* x_ptr=REAL(x);
+ const int ncols=asInteger(n_cols);
if (LENGTH(y)!=ncols*total) {
throw std::runtime_error("supplied dimensions for matrix 'y' are not consistent");
}
std::deque<const double*> y_ptrs;
- for (int i=0; i<ncols; ++i) { y_ptrs.push_back(i==0 ? NUMERIC_POINTER(y) : y_ptrs[i-1]+total); }
+ for (int i=0; i<ncols; ++i) { y_ptrs.push_back(i==0 ? REAL(y) : y_ptrs[i-1]+total); }
// Setting up output vectors.
SEXP output;
- PROTECT(output=NEW_LIST(2));
+ PROTECT(output=allocVector(VECSXP, 2));
SET_VECTOR_ELT(output, 0, allocMatrix(REALSXP, total, ncols));
- SET_VECTOR_ELT(output, 1, NEW_NUMERIC(total));
+ SET_VECTOR_ELT(output, 1, allocVector(REALSXP, total));
std::deque<double*> f_ptrs;
- for (int i=0; i<ncols; ++i) { f_ptrs.push_back(i==0 ? NUMERIC_POINTER(VECTOR_ELT(output, 0)) : f_ptrs[i-1]+total); }
- double* w_ptr=NUMERIC_POINTER(VECTOR_ELT(output, 1));
+ for (int i=0; i<ncols; ++i) { f_ptrs.push_back(i==0 ? REAL(VECTOR_ELT(output, 0)) : f_ptrs[i-1]+total); }
+ double* w_ptr=REAL(VECTOR_ELT(output, 1));
/* First we determine which of the x-axis values are closest together. This means
* that we go through all points to determine which 'frame' brings gets the closest
@@ -135,5 +133,3 @@ SEXP R_loess_by_col(SEXP x, SEXP y, SEXP n_cols, SEXP s) try {
} catch (std::exception& e) {
return mkString(e.what());
}
-
-}
diff --git a/src/R_maximize_interpolant.cpp b/src/R_maximize_interpolant.cpp
index bce02a5..1be3ab1 100644
--- a/src/R_maximize_interpolant.cpp
+++ b/src/R_maximize_interpolant.cpp
@@ -1,23 +1,20 @@
#include "utils.h"
#include "interpolator.h"
-extern "C" {
-
SEXP R_maximize_interpolant(SEXP spline_pts, SEXP likelihoods) try {
- if (!IS_NUMERIC(spline_pts)) { std::runtime_error("spline points should be a double precision vector"); }
- if (!IS_NUMERIC(likelihoods)) { std::runtime_error("likelihoods should be a double precision matrix"); }
+ if (!isNumeric(spline_pts)) { std::runtime_error("spline points should be a double precision vector"); }
+ if (!isNumeric(likelihoods)) { std::runtime_error("likelihoods should be a double precision matrix"); }
// Loading in the spline x-axis values.
const int num_pts=LENGTH(spline_pts);
- double* sptr=NUMERIC_POINTER(spline_pts);
- double* lptr=NUMERIC_POINTER(likelihoods);
+ double* sptr=REAL(spline_pts);
+ double* lptr=REAL(likelihoods);
const int nrows=LENGTH(likelihoods)/num_pts;
interpolator maxinterpol(num_pts);
// Setting up the output object and running through it.
- SEXP output;
- PROTECT(output=NEW_NUMERIC(nrows));
- double* out_ptr=NUMERIC_POINTER(output);
+ SEXP output=PROTECT(allocVector(REALSXP, nrows));
+ double* out_ptr=REAL(output);
try {
for (int count=0; count<nrows; ++count) {
*out_ptr=maxinterpol.find_max(sptr, lptr);
@@ -31,6 +28,3 @@ SEXP R_maximize_interpolant(SEXP spline_pts, SEXP likelihoods) try {
UNPROTECT(1);
return(output);
} catch (std::exception& e) { return mkString(e.what()); }
-
-}
-
diff --git a/src/R_one_group.cpp b/src/R_one_group.cpp
index a1bffd5..20388f6 100644
--- a/src/R_one_group.cpp
+++ b/src/R_one_group.cpp
@@ -1,41 +1,46 @@
#include "glm.h"
#include "matvec_check.h"
-extern "C" {
-
-SEXP R_one_group (SEXP nt, SEXP nl, SEXP y, SEXP disp, SEXP offsets, SEXP weights, SEXP max_iterations, SEXP tolerance) try {
- const int num_tags=INTEGER_VALUE(nt);
- const int num_libs=INTEGER_VALUE(nl);
+SEXP R_one_group (SEXP y, SEXP disp, SEXP offsets, SEXP weights, SEXP max_iterations, SEXP tolerance, SEXP beta) try {
+ if (!isNumeric(disp)) { throw std::runtime_error("dispersion vector must be double precision"); }
+ const int num_tags=LENGTH(disp);
+ const int num_libs=LENGTH(y)/num_tags;
if (num_tags*num_libs != LENGTH(y) ) { throw std::runtime_error("dimensions of the count table are not as specified"); } // Checking that it is an exact division.
- const int maxit=INTEGER_VALUE(max_iterations);
- const double tol=NUMERIC_VALUE(tolerance);
- if (!IS_NUMERIC(disp)) { throw std::runtime_error("dispersion vector must be double precision"); }
- if (LENGTH(disp)!=num_tags) { throw std::runtime_error("length of dispersion vector must be 1 or equal to the number of tags"); }
-
+ if (!isNumeric(beta)) { throw std::runtime_error("beta start vector must be double precision"); }
+ const int blen=LENGTH(beta);
+ const bool use_old_start=(blen!=0);
+ if (use_old_start && blen!=num_tags) {
+ throw std::runtime_error("non-empty start vector must have length equal to the number of tags");
+ }
+ const double* bsptr=REAL(beta);
+
+ const int maxit=asInteger(max_iterations);
+ const double tol=asReal(tolerance);
+
// Setting up some iterators. We provide some flexibility to detecting numeric-ness.
double *ydptr=NULL;
int* yiptr=NULL;
double* yptr=(double*)R_alloc(num_libs, sizeof(double));
- bool is_integer=IS_INTEGER(y);
+ bool is_integer=isInteger(y);
if (is_integer) {
- yiptr=INTEGER_POINTER(y);
+ yiptr=INTEGER(y);
} else {
- if (!IS_NUMERIC(y)) { throw std::runtime_error("count matrix must be integer or double-precision"); }
- ydptr=NUMERIC_POINTER(y);
+ if (!isNumeric(y)) { throw std::runtime_error("count matrix must be integer or double-precision"); }
+ ydptr=REAL(y);
}
matvec_check allo(num_libs, num_tags, offsets, false, "offset", false);
const double* const* optr2=allo.access();
matvec_check allw(num_libs, num_tags, weights, false, "weight", true);
const double* const* wptr2=allw.access();
- double* dptr=NUMERIC_POINTER(disp);
+ double* dptr=REAL(disp);
// Setting up beta for output.
- SEXP output=PROTECT(NEW_LIST(2));
- SET_VECTOR_ELT(output, 0, NEW_NUMERIC(num_tags));
- SET_VECTOR_ELT(output, 1, NEW_LOGICAL(num_tags));
- double* bptr=NUMERIC_POINTER(VECTOR_ELT(output, 0));
- int* cptr=LOGICAL_POINTER(VECTOR_ELT(output, 1));
+ SEXP output=PROTECT(allocVector(VECSXP, 2));
+ SET_VECTOR_ELT(output, 0, allocVector(REALSXP, num_tags));
+ SET_VECTOR_ELT(output, 1, allocVector(LGLSXP, num_tags));
+ double* bptr=REAL(VECTOR_ELT(output, 0));
+ int* cptr=LOGICAL(VECTOR_ELT(output, 1));
try {
// Iterating through tags and fitting.
@@ -53,13 +58,10 @@ SEXP R_one_group (SEXP nt, SEXP nl, SEXP y, SEXP disp, SEXP offsets, SEXP weight
#ifdef WEIGHTED
*wptr2,
#endif
- yptr, *dptr);
+ yptr, dptr[tag], (use_old_start ? bsptr[tag] : R_NaReal));
- (*bptr)=out.first;
- (*cptr)=out.second;
- ++bptr;
- ++cptr;
- ++dptr;
+ bptr[tag]=out.first;
+ cptr[tag]=out.second;
allo.advance();
allw.advance();
}
@@ -74,5 +76,3 @@ SEXP R_one_group (SEXP nt, SEXP nl, SEXP y, SEXP disp, SEXP offsets, SEXP weight
} catch (std::exception& e) {
return mkString(e.what());
}
-
-}
diff --git a/src/R_process_hairpin_reads.c b/src/R_process_hairpin_reads.c
index 2afb799..df0c36b 100644
--- a/src/R_process_hairpin_reads.c
+++ b/src/R_process_hairpin_reads.c
@@ -6,31 +6,35 @@
#include <time.h>
#define MAX_BARCODE 1000
-#define MAX_HAIRPIN 10000
-#define SEQ_LEN 100
+#define MAX_HAIRPIN 100000
#define BLOCKSIZE 10000000
typedef struct {
char *sequence;
+ char *sequenceRev;
int original_pos;
} a_barcode;
typedef struct {
char *sequence;
int original_pos;
- long count;
} a_hairpin;
a_barcode *barcodes[MAX_BARCODE];
a_hairpin *hairpins[MAX_HAIRPIN];
+int isPairedReads;
int num_barcode;
int num_hairpin;
long num_read;
+long hairpinreadcount[MAX_HAIRPIN];
long summary[MAX_HAIRPIN][MAX_BARCODE];
int barcode_start;
int barcode_end;
+int barcode_start_rev;
+int barcode_end_rev;
int barcode_length;
+int barcode_length_rev;
int hairpin_start;
int hairpin_end;
int hairpin_length;
@@ -48,8 +52,6 @@ long hairpincount;
long bchpcount;
a_hairpin *mismatch_hairpins[MAX_HAIRPIN];
-int *barcodeindex;
-int *hairpinindex;
void
Read_In_Barcodes(char* filename){
@@ -63,18 +65,28 @@ Read_In_Barcodes(char* filename){
a_barcode *new_barcode;
int count = 0;
+ char * token;
+
while ((readline = fgets(line, len, fin)) != NULL){
count++;
new_barcode = (a_barcode *)malloc(sizeof(a_barcode));
- new_barcode->sequence = (char *)malloc(SEQ_LEN * sizeof(char));
-
- new_barcode->original_pos = count;
+ new_barcode->sequence = (char *)malloc(barcode_length * sizeof(char));
strncpy(new_barcode->sequence, line, barcode_length);
+ new_barcode->original_pos = count;
+ if (isPairedReads > 0) {
+ token = strtok(line, "\t");
+ token = strtok(NULL, "\t");
+ new_barcode->sequenceRev = (char *)malloc(barcode_length_rev * sizeof(char));
+ strncpy(new_barcode->sequenceRev, token, barcode_length_rev);
+ } else {
+ new_barcode->sequenceRev = NULL;
+ };
barcodes[count] = new_barcode;
}
fclose(fin);
num_barcode = count;
free(line);
+
Rprintf(" -- Number of Barcodes : %d\n", num_barcode);
}
@@ -87,18 +99,47 @@ locate_barcode(char *a_barcode){
while (imax >= imin) {
imid = (imax + imin) / 2;
- if (strncmp(barcodes[imid]->sequence, a_barcode, barcode_length) < 0)
+ if (strncmp(barcodes[imid]->sequence, a_barcode, barcode_length) < 0) {
imin = imid + 1;
- else if (strncmp(barcodes[imid]->sequence, a_barcode, barcode_length) > 0)
+ } else if (strncmp(barcodes[imid]->sequence, a_barcode, barcode_length) > 0) {
imax = imid - 1;
- else
+ } else {
return barcodes[imid]->original_pos;
+ }
}
return -1;
}
int
-locate_hairpin(char *a_hairpin){
+locate_barcode_paired(char *a_barcode, char *a_barcode_rev){
+ int imin, imax, imid;
+ imin = 1;
+ imax = num_barcode;
+
+ while (imax >= imin) {
+ imid = (imax + imin) / 2;
+ // compare forward barcode sequence
+ if (strncmp(barcodes[imid]->sequence, a_barcode, barcode_length) < 0) {
+ imin = imid + 1;
+ } else if (strncmp(barcodes[imid]->sequence, a_barcode, barcode_length) > 0) {
+ imax = imid - 1;
+ } else {
+ // same forward sequence, compare reverse barcode sequence
+ if (strncmp(barcodes[imid]->sequenceRev, a_barcode_rev, barcode_length_rev) < 0) {
+ imin = imid + 1;
+ } else if (strncmp(barcodes[imid]->sequenceRev, a_barcode_rev, barcode_length_rev) > 0) {
+ imax = imid - 1;
+ } else {
+ return barcodes[imid]->original_pos;
+ }
+ }
+ }
+ return -1;
+}
+
+
+int
+locate_hairpin_impl(char *a_hairpin){
int imin, imax, imid;
imin = 1;
imax = num_hairpin;
@@ -106,12 +147,13 @@ locate_hairpin(char *a_hairpin){
while (imax >= imin) {
imid = (imax + imin) / 2;
- if (strncmp(hairpins[imid]->sequence, a_hairpin, hairpin_length) < 0)
+ if (strncmp(hairpins[imid]->sequence, a_hairpin, hairpin_length) < 0) {
imin = imid + 1;
- else if (strncmp(hairpins[imid]->sequence, a_hairpin, hairpin_length) > 0)
+ } else if (strncmp(hairpins[imid]->sequence, a_hairpin, hairpin_length) > 0) {
imax = imid - 1;
- else
+ } else {
return hairpins[imid]->original_pos;
+ }
}
return -1;
}
@@ -121,13 +163,15 @@ Valid_Match(char *sequence1, char *sequence2, int length, int threshold){
int i_base;
int mismatchbasecount = 0;
for (i_base = 0; i_base < length; i_base++) {
- if (sequence1[i_base] != sequence2[i_base])
+ if (sequence1[i_base] != sequence2[i_base]) {
mismatchbasecount++;
+ }
}
- if (mismatchbasecount <= threshold)
+ if (mismatchbasecount <= threshold) {
return 1;
- else
+ } else {
return -1;
+ }
}
int
@@ -144,6 +188,21 @@ locate_mismatch_barcode(char *a_barcode){
}
int
+locate_mismatch_barcode_paired(char *a_barcode, char *a_barcode_rev){
+ int i;
+ int match_index = -1;
+ for (i = 1; i <= num_barcode; i++){
+ if ((Valid_Match(a_barcode, barcodes[i]->sequence, barcode_length, barcode_n_mismatch) > 0) &&
+ (Valid_Match(a_barcode_rev, barcodes[i]->sequenceRev, barcode_length_rev, barcode_n_mismatch) > 0)) {
+ match_index = barcodes[i]->original_pos;
+ break;
+ }
+ }
+ return match_index;
+}
+
+
+int
locate_mismatch_hairpin(char *a_hairpin){
int i;
int match_index = -1;
@@ -157,13 +216,75 @@ locate_mismatch_hairpin(char *a_hairpin){
}
+int
+locate_hairpin(char *a_hairpin, char *read, int doMismatch){
+ int hairpin_index;
+ if (doMismatch > 0){
+ hairpin_index = locate_mismatch_hairpin(a_hairpin);
+ } else {
+ hairpin_index = locate_hairpin_impl(a_hairpin);
+ }
+
+ if ((hairpin_index <= 0) && (allow_shifting > 0)) {
+ if ((doMismatch > 0) && (allow_shifted_mismatch <= 0)) {
+ return hairpin_index;
+ exit(0);
+ }
+ // Check if given hairpin can be mapped to a shifted location.
+ char *shiftedharipinstr;
+ shiftedharipinstr = (char *)malloc(hairpin_length * sizeof(char));
+
+ int index;
+ // check shifting leftwards
+ for (index = 1; index <= shifting_n_base; index++){
+ strncpy(shiftedharipinstr, read + hairpin_start - 1 - index, hairpin_length);
+ if (doMismatch > 0){
+ hairpin_index = locate_mismatch_hairpin(shiftedharipinstr);
+ } else {
+ hairpin_index = locate_hairpin_impl(shiftedharipinstr);
+ }
+ if (hairpin_index > 0) {
+ break;
+ }
+ }
+ // check shifting rightwards
+ if (hairpin_index <= 0){
+ for (index = 1; index <= shifting_n_base; index++){
+ strncpy(shiftedharipinstr, read + hairpin_start - 1 + index, hairpin_length);
+
+ if (doMismatch > 0){
+ hairpin_index = locate_mismatch_hairpin(shiftedharipinstr);
+ } else {
+ hairpin_index = locate_hairpin_impl(shiftedharipinstr);
+ }
+
+ if (hairpin_index > 0) {
+ break;
+ }
+ }
+ }
+ }
+ return hairpin_index;
+}
+
+int
+barcode_compare(a_barcode *barcode1, a_barcode *barcode2){
+ int ans;
+ ans = strncmp(barcode1->sequence, barcode2->sequence, barcode_length);
+ if ((ans == 0) && (isPairedReads > 0)){
+ ans = strncmp(barcode1->sequenceRev, barcode2->sequenceRev, barcode_length_rev);
+ }
+
+ return ans;
+}
+
void
Sort_Barcodes(void){
int i, j;
a_barcode *temp;
for(i = 1; i < num_barcode; i++){
for(j = i+1; j <= num_barcode; j++){
- if (strcmp(barcodes[i]->sequence, barcodes[j]->sequence) > 0){
+ if (barcode_compare(barcodes[i], barcodes[j]) > 0) {
temp = barcodes[i];
barcodes[i] = barcodes[j];
barcodes[j] = temp;
@@ -187,15 +308,19 @@ Read_In_Hairpins(char *filename){
while ((readline = fgets(line, len, fin)) != NULL){
count++;
new_hairpin = (a_hairpin *)malloc(sizeof(a_hairpin));
- new_hairpin->sequence = (char *)malloc(SEQ_LEN * sizeof(char));
+ new_hairpin->sequence = (char *)malloc(hairpin_length * sizeof(char));
new_hairpin->original_pos = count;
- new_hairpin->count = 0;
strncpy(new_hairpin->sequence, line, hairpin_length);
hairpins[count] = new_hairpin;
}
fclose(fin);
num_hairpin = count;
free(line);
+
+ int i_hairpin;
+ for (i_hairpin = 1; i_hairpin <= num_hairpin; i_hairpin++){
+ hairpinreadcount[i_hairpin] = 0;
+ }
Rprintf(" -- Number of Hairpins : %d\n", num_hairpin);
}
@@ -205,7 +330,7 @@ Sort_Hairpins(void){
a_hairpin *temp;
for(i = 1; i < num_hairpin; i++){
for(j = i+1; j <= num_hairpin; j++){
- if (strcmp(hairpins[i]->sequence, hairpins[j]->sequence) > 0){
+ if (strncmp(hairpins[i]->sequence, hairpins[j]->sequence, hairpin_length) > 0){
temp = hairpins[i];
hairpins[i] = hairpins[j];
hairpins[j] = temp;
@@ -214,108 +339,109 @@ Sort_Hairpins(void){
}
}
-long Count_Reads(char *filename) {
- FILE *freads = fopen(filename, "r");
- char * line = NULL;
- char *readline;
- size_t len = 1000;
- line = (char *)malloc(sizeof(char) * (len + 1));
- if(freads == NULL){
- fclose(freads);
- return 0;
- }
- long line_count = 0;
- while ((readline = fgets(line, len, freads)) != NULL){
- line_count++;
- }
- fclose(freads);
- free(line);
- return line_count / 4;
-}
-
-
void
-Process_Hairpin_Reads(char *filename){
- FILE *fin;
+Process_Hairpin_Reads(char *filename, char *filename2){
+ FILE *fin = NULL;
+ FILE *finRev = NULL;
char * line = NULL;
+ char * line2 = NULL;
size_t len = 1000;
char *readline;
+ char *readline2;
long num_read_thisfile = 0;
- fin = fopen(filename,"r");
+ char *this_barcode_for = NULL;
+ char *this_barcode_rev = NULL;
+ char *this_hairpin = NULL;
+
line = (char *)malloc(sizeof(char) * (len+1));
-
- if (isverbose)
- Rprintf("Processing reads in %s.\n", filename);
+ fin = fopen(filename,"r");
+ if (isPairedReads > 0) {
+ finRev = fopen(filename2, "r");
+ line2 = (char *)malloc(sizeof(char) * (len+1));
+ }
- char * this_barcode;
- char * this_hairpin;
- this_barcode = (char *)malloc(SEQ_LEN * sizeof(char));
- this_hairpin = (char *)malloc(SEQ_LEN * sizeof(char));
+ if (isverbose > 0){
+ if (isPairedReads > 0) {
+ Rprintf("Processing reads in %s and %s.\n", filename, filename2);
+ } else {
+ Rprintf("Processing reads in %s.\n", filename);
+ }
+ }
+ this_barcode_for = (char *)malloc(barcode_length * sizeof(char));
+ if (isPairedReads > 0) {
+ this_barcode_rev = (char *)malloc(barcode_length_rev * sizeof(char));
+ }
+ this_hairpin = (char *)malloc(hairpin_length * sizeof(char));
long line_count = 0;
int barcode_index;
int hairpin_index;
while ((readline = fgets(line, len, fin)) != NULL){
+ if (isPairedReads > 0) {
+ readline2 = fgets(line2, len, finRev);
+ if (readline2 == NULL) {
+ break;
+ }
+ }
line_count++;
- if ((line_count % 4) != 2)
+ if ((line_count % 4) != 2) {
continue;
-
- if ((isverbose) && (num_read_thisfile % BLOCKSIZE == 0))
+ }
+
+ if ((isverbose > 0) && (num_read_thisfile % BLOCKSIZE == 0)) {
Rprintf(" -- Processing %d million reads\n", (num_read_thisfile / BLOCKSIZE + 1) * 10);
+ }
num_read++;
num_read_thisfile++;
- strncpy(this_barcode, line + barcode_start - 1, barcode_length);
- barcode_index = locate_barcode(this_barcode);
+
+ strncpy(this_barcode_for, line + barcode_start - 1, barcode_length);
+ if (isPairedReads > 0){
+ strncpy(this_barcode_rev, line2 + barcode_start_rev - 1, barcode_length_rev);
+ barcode_index = locate_barcode_paired(this_barcode_for, this_barcode_rev);
+ } else {
+ barcode_index = locate_barcode(this_barcode_for);
+ }
strncpy(this_hairpin, line + hairpin_start - 1, hairpin_length);
- hairpin_index = locate_hairpin(this_hairpin);
-
- if ((hairpin_index <= 0) && (allow_shifting > 0)){
- // Check if given hairpin can be mapped to a shifted location.
- int index;
- // check shifting leftwards
- for (index = 1; index <= shifting_n_base; index++){
- strncpy(this_hairpin, line + hairpin_start - 1 - index, hairpin_length);
- hairpin_index = locate_hairpin(this_hairpin);
- if (hairpin_index > 0)
- break;
- }
- // check shifting rightwards
- if (hairpin_index <= 0){
- for (index = 1; index <= shifting_n_base; index++){
- strncpy(this_hairpin, line + hairpin_start - 1 + index, hairpin_length);
- hairpin_index = locate_hairpin(this_hairpin);
- if (hairpin_index > 0)
- break;
- }
- }
- }
+ hairpin_index = locate_hairpin(this_hairpin, line, 0); // not allowing mismatch
- if (barcode_index > 0)
+ if (barcode_index > 0){
barcodecount++;
+ }
if (hairpin_index > 0){
hairpincount++;
- hairpins[hairpin_index]->count++;
+ hairpinreadcount[hairpin_index]++;
}
if ((barcode_index > 0) && (hairpin_index > 0)) {
summary[hairpin_index][barcode_index]++;
bchpcount++;
}
-
- barcodeindex[num_read] = barcode_index;
- hairpinindex[num_read] = hairpin_index;
+
+ } // end while
+
+ if (isverbose > 0) {
+ if (isPairedReads > 0) {
+ Rprintf("Number of reads in file %s and %s: %ld\n", filename, filename2, num_read_thisfile);
+ } else {
+ Rprintf("Number of reads in file %s : %ld\n", filename, num_read_thisfile);
+ }
}
- if (isverbose)
- Rprintf("Number of reads in file %s : %ld\n", filename, num_read_thisfile);
- fclose(fin);
+
+ fclose(fin);
free(line);
- free(this_barcode);
- free(this_hairpin);
+ free(this_barcode_for);
+ free(this_hairpin);
+
+ if (isPairedReads > 0){
+ fclose(finRev);
+ free(line2);
+ free(this_barcode_rev);
+ }
}
void
@@ -324,7 +450,7 @@ Create_Mismatch_Hairpins_List(void){
num_mismatch_hairpin = 0;
for (i = 1; i <= num_hairpin; i++){
- if (hairpins[i]->count == 0){
+ if (hairpinreadcount[i] == 0){
num_mismatch_hairpin++;
mismatch_hairpins[num_mismatch_hairpin] = hairpins[i];
}
@@ -334,98 +460,132 @@ Create_Mismatch_Hairpins_List(void){
void
-Process_Mismatch(char *filename){
+Process_Mismatch(char *filename, char *filename2){
- FILE *fin;
+ FILE *fin = NULL;
+ FILE *finRev = NULL;
char * line = NULL;
+ char * line2 = NULL;
size_t len = 1000;
char *readline;
+ char *readline2;
long num_read_thisfile = 0;
- fin = fopen(filename,"r");
- line = (char *)malloc(len+1);
+ char * this_hairpin;
+ char * this_barcode_for;
+ char * this_barcode_rev = NULL;
- if (isverbose)
- Rprintf("Processing reads in %s, considering sequence mismatch. \n", filename);
+ line = (char *)malloc(len+1);
+ fin = fopen(filename,"r");
+ if (isPairedReads > 0) {
+ finRev = fopen(filename2, "r");
+ line2 = (char *)malloc(sizeof(char) * (len+1));
+ }
+ if (isverbose > 0){
+ if (isPairedReads > 0) {
+ Rprintf("Re-processing reads in %s and %s, considering sequence mismatch\n", filename, filename2);
+ } else {
+ Rprintf("Re-processing reads in %s, considering sequence mismatch\n", filename);
+ }
+ }
- char * this_hairpin;
- char * this_barcode;
- this_hairpin = (char *)malloc(SEQ_LEN * sizeof(char));
- this_barcode = (char *)malloc(SEQ_LEN * sizeof(char));
+ this_barcode_for = (char *)malloc(barcode_length * sizeof(char));
+ if (isPairedReads > 0) {
+ this_barcode_rev = (char *)malloc(barcode_length_rev * sizeof(char));
+ }
+ this_hairpin = (char *)malloc(hairpin_length * sizeof(char));
long line_count = 0;
+ int barcode_index;
+ int hairpin_index;
+
int new_barcode_index;
int new_hairpin_index;
+
while ((readline = fgets(line, len, fin)) != NULL){
+ if (isPairedReads > 0) {
+ readline2 = fgets(line2, len, finRev);
+ if (readline2 == NULL) {
+ break;
+ }
+ }
+
line_count++;
- if ((line_count % 4) != 2)
+ if ((line_count % 4) != 2) {
continue;
+ }
- if ((isverbose) && (num_read_thisfile % BLOCKSIZE == 0))
+ if ((isverbose > 0) && (num_read_thisfile % BLOCKSIZE == 0)) {
Rprintf(" -- Processing %d million reads\n", (num_read_thisfile / BLOCKSIZE + 1) * 10);
+ }
num_read++;
num_read_thisfile++;
- // only re-process reads withougt perfect hairpin match or without perfect barcode match;
- if ((hairpinindex[num_read] > 0) && (barcodeindex[num_read] > 0))
+ // re-do the mapping in Process_Hairpin_Reads()
+ strncpy(this_barcode_for, line + barcode_start - 1, barcode_length);
+ if (isPairedReads > 0){
+ strncpy(this_barcode_rev, line2 + barcode_start_rev - 1, barcode_length_rev);
+ barcode_index = locate_barcode_paired(this_barcode_for, this_barcode_rev);
+ } else {
+ barcode_index = locate_barcode(this_barcode_for);
+ }
+
+ strncpy(this_hairpin, line + hairpin_start - 1, hairpin_length);
+ hairpin_index = locate_hairpin(this_hairpin, line, 0); //not allowing mismatch
+
+ // only re-process reads without perfect hairpin match or without perfect barcode match;
+ if ((barcode_index > 0) && (hairpin_index > 0)) {
continue;
+ }
- // re-match barcode:
- if (barcodeindex[num_read] <= 0){
- strncpy(this_barcode, line + barcode_start - 1, barcode_length);
- new_barcode_index = locate_mismatch_barcode(this_barcode);
- if (new_barcode_index > 0)
- barcodecount++;
+ if (barcode_index > 0) {
+ new_barcode_index = barcode_index;
} else {
- new_barcode_index = barcodeindex[num_read];
- }
+ if (isPairedReads > 0){
+ new_barcode_index = locate_mismatch_barcode_paired(this_barcode_for, this_barcode_rev);
+ } else {
+ new_barcode_index = locate_mismatch_barcode(this_barcode_for);
+ }
+
+ if (new_barcode_index > 0) {
+ barcodecount++;
+ }
+ }
// re-match hairpin:
- if (hairpinindex[num_read] <= 0){
- strncpy(this_hairpin, line + hairpin_start - 1, hairpin_length);
- new_hairpin_index = locate_mismatch_hairpin(this_hairpin);
- if ((new_hairpin_index <= 0) && (allow_shifting > 0) && (allow_shifted_mismatch > 0)){
- // Check if given hairpin can be mapped to a shifted location.
- int index;
- // check shifting leftwards
- for (index = 1; index <= shifting_n_base; index++){
- strncpy(this_hairpin, line + hairpin_start - 1 - index, hairpin_length);
- new_hairpin_index = locate_mismatch_hairpin(this_hairpin);
- if (new_hairpin_index > 0)
- break;
- }
- // check shifting rightwards
- if (new_hairpin_index <= 0){
- for (index = 1; index <= shifting_n_base; index++){
- strncpy(this_hairpin, line + hairpin_start - 1 + index, hairpin_length);
- new_hairpin_index = locate_mismatch_hairpin(this_hairpin);
- if (new_hairpin_index > 0)
- break;
- }
- }
- }
- if (new_hairpin_index > 0)
- hairpincount++;
- } else {
- new_hairpin_index = hairpinindex[num_read];
- }
+ if (hairpin_index > 0){
+ new_hairpin_index = hairpin_index;
+ } else {
+ new_hairpin_index = locate_hairpin(this_hairpin, line, 1);
- if ((new_barcode_index > 0) && (new_hairpin_index > 0)) {
- summary[new_hairpin_index][new_barcode_index]++;
- bchpcount++;
- }
+ if (new_hairpin_index > 0) {
+ hairpincount++;
+ }
+ }
+
+ if ((new_barcode_index > 0) && (new_hairpin_index > 0)) {
+ summary[new_hairpin_index][new_barcode_index]++;
+ bchpcount++;
+ }
+ } // end while
- }
- fclose(fin);
+ fclose(fin);
free(line);
- free(this_barcode);
- free(this_hairpin);
+ free(this_barcode_for);
+ free(this_hairpin);
+
+ if (isPairedReads > 0){
+ fclose(finRev);
+ free(line2);
+ free(this_barcode_rev);
+ }
+
}
void
-Initialise(int barcodestart, int barcodeend, int hairpinstart, int hairpinend,
+Initialise(int IsPaired, int barcodestart, int barcodeend, int barcodestartrev, int barcodeendrev, int hairpinstart, int hairpinend,
int allowshifting, int shiftingnbase,
int allowMismatch, int barcodemismatch, int hairpinmismatch,
int allowShiftedMismatch, int verbose){
@@ -438,11 +598,15 @@ Initialise(int barcodestart, int barcodeend, int hairpinstart, int hairpinend,
num_barcode = 0;
num_hairpin = 0;
+ isPairedReads = IsPaired;
barcode_start = barcodestart;
barcode_end = barcodeend;
+ barcode_start_rev = barcodestartrev;
+ barcode_end_rev = barcodeendrev;
hairpin_start = hairpinstart;
hairpin_end = hairpinend;
barcode_length = barcode_end - barcode_start + 1;
+ barcode_length_rev = barcode_end_rev - barcode_start_rev + 1;
hairpin_length = hairpin_end - hairpin_start + 1;
allow_shifting = allowshifting;
@@ -493,6 +657,9 @@ Clean_Up(void){
int index;
for (index = 1; index <= num_barcode; index++){
free(barcodes[index]->sequence);
+ if (isPairedReads > 0){
+ free(barcodes[index]->sequenceRev);
+ }
free(barcodes[index]);
}
for (index = 1; index <= num_hairpin; index++){
@@ -501,38 +668,35 @@ Clean_Up(void){
}
}
+
void
-processHairpinReads(char **file, int *filecount,
+processHairpinReads(int *isPairedReads,
+ char **file, char **file2, int *filecount,
char**barcodeseqs, char**hairpinseqs,
- int *barcodestart, int *barcodeend, int *hairpinstart, int *hairpinend,
+ int *barcodestart, int *barcodeend, int *barcodestartrev, int *barcodeendrev, int *hairpinstart, int *hairpinend,
int *allowShifting, int *shiftingnbase,
int *allowMismatch, int *barcodemismatch, int *hairpinmismatch,
int *allowShiftedMismatch,
char **output, int *verbose)
{
- Initialise(*barcodestart, *barcodeend, *hairpinstart, *hairpinend,
+ Initialise(*isPairedReads, *barcodestart, *barcodeend, *barcodestartrev, *barcodeendrev, *hairpinstart, *hairpinend,
*allowShifting, *shiftingnbase,
*allowMismatch, *barcodemismatch, *hairpinmismatch,
*allowShiftedMismatch, *verbose);
Read_In_Barcodes(*barcodeseqs);
- Sort_Barcodes();
+ Sort_Barcodes(); // () is necessary to invoke function without parameters
Read_In_Hairpins(*hairpinseqs);
- Check_Hairpins();
- Sort_Hairpins();
- long totalreads;
+ Check_Hairpins();
+ Sort_Hairpins();
+
int i_file;
- totalreads = 0;
+
for (i_file = 0; i_file < *filecount; i_file++){
- totalreads = totalreads + Count_Reads(file[i_file]);
+ Process_Hairpin_Reads(file[i_file], file2[i_file]);
}
- barcodeindex = (int *)malloc(sizeof(int) * totalreads);
- hairpinindex = (int *)malloc(sizeof(int) * totalreads);
- for (i_file = 0; i_file < *filecount; i_file++){
- Process_Hairpin_Reads(file[i_file]);
- }
if (allow_mismatch > 0){
num_read = 0;
@@ -540,13 +704,20 @@ processHairpinReads(char **file, int *filecount,
Create_Mismatch_Hairpins_List();
if (num_mismatch_hairpin > 0){
for (i_file = 0; i_file < *filecount; i_file++){
- Process_Mismatch(file[i_file]);
+ if (isPairedReads > 0) {
+ Process_Mismatch(file[i_file], file2[i_file]);
+ } else {
+ Process_Mismatch(file[i_file], NULL);
+ }
}
}
}
Rprintf("\nThe input run parameters are: \n");
Rprintf(" -- Barcode: start position %d\t end position %d\t length %d\n", barcode_start, barcode_end, barcode_length);
+ if (isPairedReads > 0){
+ Rprintf(" -- Barcode in reverse read: start position %d\t end position %d\t length %d\n", barcode_start_rev, barcode_end_rev, barcode_length_rev);
+ }
Rprintf(" -- Hairpin: start position %d\t end position %d\t length %d\n", hairpin_start, hairpin_end, hairpin_length);
if (allow_shifting) {
Rprintf(" -- Allow hairpin sequences to be matched to a shifted position, <= %d base left or right of the specified positions. \n", shifting_n_base);
@@ -557,7 +728,7 @@ processHairpinReads(char **file, int *filecount,
Rprintf(" -- Allow sequence mismatch, <= %d base in barcode sequence and <= %d base in hairpin sequence. \n", barcode_n_mismatch, hairpin_n_mismatch );
} else {
Rprintf(" -- Mismatch in barcode/hairpin sequences not allowed. \n");
- }
+ }
Rprintf("\nTotal number of read is %ld \n", num_read);
Rprintf("There are %ld reads (%.4f percent) with barcode matches\n", barcodecount, 100.0*barcodecount/num_read);
@@ -565,7 +736,13 @@ processHairpinReads(char **file, int *filecount,
Rprintf("There are %ld reads (%.4f percent) with both barcode and hairpin matches\n", bchpcount, 100.0*bchpcount/num_read);
Output_Summary_Table(*output);
- free(barcodeindex);
- free(hairpinindex);
+
+ Clean_Up();
}
+
+
+
+
+
+
diff --git a/src/R_simple_good_turing.cpp b/src/R_simple_good_turing.cpp
index 5e0e458..f7eb674 100644
--- a/src/R_simple_good_turing.cpp
+++ b/src/R_simple_good_turing.cpp
@@ -8,16 +8,14 @@
#include "utils.h"
-extern "C" {
-
SEXP R_simple_good_turing (SEXP obs, SEXP freq, SEXP conf) try {
- const double confid_factor=NUMERIC_VALUE(conf);
- if (!IS_INTEGER(obs)) { throw std::runtime_error("observations vector must be integral"); }
- if (!IS_INTEGER(freq)) { throw std::runtime_error("frequencies vector must be integral"); }
+ const double confid_factor=asReal(conf);
+ if (!isInteger(obs)) { throw std::runtime_error("observations vector must be integral"); }
+ if (!isInteger(freq)) { throw std::runtime_error("frequencies vector must be integral"); }
const int rows=LENGTH(obs);
if (rows!=LENGTH(freq)) { throw std::runtime_error("length of vectors must match"); }
- int* optr=INTEGER_POINTER(obs);
- int* fptr=INTEGER_POINTER(freq);
+ int* optr=INTEGER(obs);
+ int* fptr=INTEGER(freq);
// Prefilling various data structures.
double bigN=0;
@@ -31,7 +29,7 @@ SEXP R_simple_good_turing (SEXP obs, SEXP freq, SEXP conf) try {
// Computing log data.
const int& x=(i==0 ? 0 : optr[i-1]);
- const double& logO=(log_obs[i]=std::log(optr[i]));
+ const double& logO=(log_obs[i]=std::log(double(optr[i])));
const double logZ=std::log(2*fptr[i]/double(i==last ? 2*(optr[i]-x) : optr[i+1]-x));
meanX+=logO;
meanY+=logZ;
@@ -51,10 +49,10 @@ SEXP R_simple_good_turing (SEXP obs, SEXP freq, SEXP conf) try {
const double& PZero = ((rows==0 || optr[0]!=1) ? 0 : (fptr[0] / double(bigN)));
// Setting up the output vector.
- SEXP output=PROTECT(NEW_LIST(2));
+ SEXP output=PROTECT(allocVector(VECSXP, 2));
SET_VECTOR_ELT(output, 0, ScalarReal(PZero));
- SET_VECTOR_ELT(output, 1, NEW_NUMERIC(rows));
- double* out_ptr=NUMERIC_POINTER(VECTOR_ELT(output, 1));
+ SET_VECTOR_ELT(output, 1, allocVector(REALSXP, rows));
+ double* out_ptr=REAL(VECTOR_ELT(output, 1));
try{
// Collecting results.
@@ -62,7 +60,7 @@ SEXP R_simple_good_turing (SEXP obs, SEXP freq, SEXP conf) try {
bool indiffValsSeen=false;
for (long i=0; i<rows; ++i) {
const int next_obs=optr[i]+1;
- const double y = next_obs*std::exp(slope*(std::log(next_obs)-log_obs[i])); // don't need intercept, cancels out.
+ const double y = next_obs*std::exp(slope*(std::log(double(next_obs))-log_obs[i])); // don't need intercept, cancels out.
// std::cout << "y for " << i << " is " << y << std::endl;
if (i==last || optr[i+1]!=next_obs) { indiffValsSeen=true; }
if (!indiffValsSeen) {
@@ -98,5 +96,3 @@ SEXP R_simple_good_turing (SEXP obs, SEXP freq, SEXP conf) try {
} catch (std::exception& e) {
return mkString(e.what());
}
-
-}
diff --git a/src/glm.h b/src/glm.h
index 4b59a72..5fbac85 100644
--- a/src/glm.h
+++ b/src/glm.h
@@ -8,7 +8,7 @@ std::pair<double,bool> glm_one_group(const int&, const int&, const double&, cons
#ifdef WEIGHTED
const double*,
#endif
- const double*, const double&);
+ const double*, const double&, double);
class glm_levenberg {
public:
diff --git a/src/glm_one_group.cpp b/src/glm_one_group.cpp
index 02b7e28..8daa626 100644
--- a/src/glm_one_group.cpp
+++ b/src/glm_one_group.cpp
@@ -4,36 +4,46 @@ std::pair<double,bool> glm_one_group(const int& nlibs, const int& maxit, const d
#ifdef WEIGHTED
const double* weights,
#endif
- const double* y,
- const double& disp) {
+ const double* y, const double& disp, double cur_beta) {
/* Setting up initial values for beta as the log of the mean of the ratio of counts to offsets.
* This is the exact solution for the gamma distribution (which is the limit of the NB as
- * the dispersion goes to infinity.
+ * the dispersion goes to infinity. However, if cur_beta is not NA, then we assume it's good.
*/
bool nonzero=false;
- double cur_beta=0, totweight=0;
- for (int j=0; j<nlibs; ++j) {
- const double& cur_val=y[j];
- if (cur_val>low_value) {
+ if (ISNA(cur_beta)) {
+ cur_beta=0;
+#ifdef WEIGHTED
+ double totweight=0;
+#else
+ double totweight=nlibs;
+#endif
+ for (int j=0; j<nlibs; ++j) {
+ const double& cur_val=y[j];
+ if (cur_val>low_value) {
#ifdef WEIGHTED
- cur_beta+=cur_val/std::exp(offset[j]) * weights[j];
+ cur_beta+=cur_val/std::exp(offset[j]) * weights[j];
#else
- cur_beta+=cur_val/std::exp(offset[j]);
+ cur_beta+=cur_val/std::exp(offset[j]);
#endif
- nonzero=true;
- }
+ nonzero=true;
+ }
#ifdef WEIGHTED
- totweight+=weights[j];
-#else
- ++totweight;
-#endif
+ totweight+=weights[j];
+#endif
+ }
+ cur_beta=std::log(cur_beta/totweight);
+ } else {
+ for (int j=0; j<nlibs; ++j) {
+ if (y[j] > low_value) { nonzero=true; break; }
+ }
}
+
+ // Skipping to a result for all-zero rows.
if (!nonzero) { return std::make_pair(R_NegInf, true); }
- // If we can't cop out of it, we'll do Newton-Raphson iterations instead.
+ // Newton-Raphson iterations to converge to mean.
bool has_converged=false;
double dl, info;
- cur_beta=std::log(cur_beta/totweight);
for (int i=0; i<maxit; ++i) {
dl=0;
info=0;
diff --git a/src/init.cpp b/src/init.cpp
new file mode 100644
index 0000000..d91e718
--- /dev/null
+++ b/src/init.cpp
@@ -0,0 +1,30 @@
+#include "utils.h"
+
+#define CALLDEF(name, n) {#name, (DL_FUNC) &name, n}
+
+extern "C" {
+
+static const R_CallMethodDef all_call_entries[] = {
+ CALLDEF(R_compute_nbdev, 3),
+ CALLDEF(R_cr_adjust, 3),
+ CALLDEF(R_exact_test_by_deviance, 5),
+ CALLDEF(R_levenberg, 11),
+ CALLDEF(R_loess_by_col, 4),
+ CALLDEF(R_maximize_interpolant, 2),
+ CALLDEF(R_one_group, 7),
+ CALLDEF(R_simple_good_turing, 3),
+ {NULL, NULL, 0}
+};
+
+R_CMethodDef all_c_entries[] = {
+ {"processHairpinReads", (DL_FUNC) &processHairpinReads, 20},
+ {NULL, NULL, 0}
+ };
+
+void attribute_visible R_init_edgeR(DllInfo *dll) {
+ R_registerRoutines(dll, all_c_entries, all_call_entries, NULL, NULL);
+ R_useDynamicSymbols(dll, FALSE);
+ R_forceSymbols(dll, TRUE);
+}
+
+}
diff --git a/src/matvec_check.cpp b/src/matvec_check.cpp
index 4c2ea17..6d5336a 100644
--- a/src/matvec_check.cpp
+++ b/src/matvec_check.cpp
@@ -16,13 +16,13 @@ matvec_check::matvec_check(const int nlib, const int nlen, SEXP incoming, const
return;
}
- if (!IS_NUMERIC(incoming)) {
+ if (!isNumeric(incoming)) {
failed << err << " vector or matrix should be double precision";
throw std::runtime_error(failed.str());
}
// Checking if it is a vector, matrix or transposed matrix.
- mycheck=NUMERIC_POINTER(incoming);
+ mycheck=REAL(incoming);
const int curlen=LENGTH(incoming);
if (curlen==0) {
// If it's empty, it's treated as a null.
diff --git a/src/utils.h b/src/utils.h
index 2d15018..98a690a 100644
--- a/src/utils.h
+++ b/src/utils.h
@@ -12,10 +12,38 @@
#include <sstream>
#include "R.h"
-#include "Rdefines.h"
+#include "Rinternals.h"
+#include "R_ext/Rdynload.h"
+#include "R_ext/Visibility.h"
#include "R_ext/BLAS.h"
#include "R_ext/Lapack.h"
const double low_value=std::pow(10.0, -10.0), log_low_value=std::log(low_value);
+/* Defining all R-accessible functions. */
+
+extern "C" {
+
+SEXP R_compute_nbdev(SEXP, SEXP, SEXP);
+
+SEXP R_cr_adjust (SEXP, SEXP, SEXP);
+
+SEXP R_exact_test_by_deviance(SEXP, SEXP, SEXP, SEXP, SEXP);
+
+SEXP R_levenberg (SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP);
+
+SEXP R_loess_by_col(SEXP, SEXP, SEXP, SEXP);
+
+SEXP R_maximize_interpolant(SEXP, SEXP);
+
+SEXP R_one_group (SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP);
+
+SEXP R_simple_good_turing (SEXP, SEXP, SEXP);
+
+void processHairpinReads(int *, char**, char**, int*,
+ char**, char**, int*, int*, int*, int*,
+ int*, int*, int*, int*, int*, int*,
+ int *, int *, char**, int*);
+}
+
#endif
diff --git a/tests/edgeR-Tests.R b/tests/edgeR-Tests.R
index c6dbb3a..bb189f4 100644
--- a/tests/edgeR-Tests.R
+++ b/tests/edgeR-Tests.R
@@ -75,6 +75,13 @@ summary(dglm$trended.dispersion)
dglm <- estimateGLMTagwiseDisp(dglm,design,prior.df=20)
summary(dglm$tagwise.dispersion)
+dglm2 <- estimateDisp(dglm, design)
+summary(dglm2$tagwise.dispersion)
+dglm2 <- estimateDisp(dglm, design, prior.df=20)
+summary(dglm2$tagwise.dispersion)
+dglm2 <- estimateDisp(dglm, design, robust=TRUE)
+summary(dglm2$tagwise.dispersion)
+
# Continuous trend
nlibs <- 3
ntags <- 1000
@@ -97,6 +104,13 @@ topTags(results)
d <- estimateGLMCommonDisp(d, design, verbose=TRUE)
glmFit(d,design,dispersion=dispersion.true, prior.count=0.5/3)
+d2 <- estimateDisp(d, design)
+summary(d2$tagwise.dispersion)
+d2 <- estimateDisp(d, design, prior.df=20)
+summary(d2$tagwise.dispersion)
+d2 <- estimateDisp(d, design, robust=TRUE)
+summary(d2$tagwise.dispersion)
+
# Exact tests
y <- matrix(rnbinom(20,mu=10,size=3/2),5,4)
group <- factor(c(1,1,2,2))
diff --git a/tests/edgeR-Tests.Rout.save b/tests/edgeR-Tests.Rout.save
index 8fc2197..b0c9742 100644
--- a/tests/edgeR-Tests.Rout.save
+++ b/tests/edgeR-Tests.Rout.save
@@ -1,12 +1,14 @@
-R version 3.1.0 (2014-04-10) -- "Spring Dance"
+R version 3.1.1 (2014-07-10) -- "Sock it to Me"
Copyright (C) 2014 The R Foundation for Statistical Computing
-Platform: x86_64-w64-mingw32/x64 (64-bit)
+Platform: i386-w64-mingw32/i386 (32-bit)
R is free software and comes with ABSOLUTELY NO WARRANTY.
You are welcome to redistribute it under certain conditions.
Type 'license()' or 'licence()' for distribution details.
+ Natural language support but running in an English locale
+
R is a collaborative project with many contributors.
Type 'contributors()' for more information and
'citation()' on how to cite R or R packages in publications.
@@ -180,7 +182,106 @@ $fitted.values
[9,] 4.5 4.5 5.5 5.5
[10,] 10.0 10.0 13.0 13.0
-> mglmOneWay(d[10.2000 0.3469
+> mglmOneWay(d[1:10,],design,dispersion=0)
+$coefficients
+ (Intercept) group2
+ [1,] -1.000000e+08 0.000000e+00
+ [2,] -1.000000e+08 1.000000e+08
+ [3,] 2.525729e+00 -5.108256e-01
+ [4,] 2.525729e+00 1.484200e-01
+ [5,] 2.140066e+00 -1.941560e-01
+ [6,] 2.079442e+00 -1.163151e+00
+ [7,] 2.014903e+00 2.363888e-01
+ [8,] 1.945910e+00 -5.596158e-01
+ [9,] 1.504077e+00 2.006707e-01
+[10,] 2.302585e+00 2.623643e-01
+
+$fitted.values
+ [,1] [,2] [,3] [,4]
+ [1,] 0.0 0.0 0.0 0.0
+ [2,] 0.0 0.0 2.0 2.0
+ [3,] 12.5 12.5 7.5 7.5
+ [4,] 12.5 12.5 14.5 14.5
+ [5,] 8.5 8.5 7.0 7.0
+ [6,] 8.0 8.0 2.5 2.5
+ [7,] 7.5 7.5 9.5 9.5
+ [8,] 7.0 7.0 4.0 4.0
+ [9,] 4.5 4.5 5.5 5.5
+[10,] 10.0 10.0 13.0 13.0
+
+>
+> fit <- glmFit(d,design,dispersion=d$common.dispersion,prior.count=0.5/4)
+> lrt <- glmLRT(fit,coef=2)
+> topTags(lrt)
+Coefficient: group2
+ logFC logCPM LR PValue FDR
+Tag.17 2.0450964 13.73726 6.0485417 0.01391779 0.3058698
+Tag.2 4.0861092 11.54121 4.8400340 0.02780635 0.3058698
+Tag.21 -1.7265870 13.38327 4.0777825 0.04345065 0.3186381
+Tag.6 -1.6329986 12.81479 3.0078205 0.08286364 0.4557500
+Tag.16 0.9324996 13.57074 1.3477682 0.24566867 0.8276702
+Tag.20 0.8543138 13.76364 1.1890032 0.27553071 0.8276702
+Tag.19 -0.7976602 13.31405 0.9279151 0.33540526 0.8276702
+Tag.12 0.7081170 14.31389 0.9095513 0.34023349 0.8276702
+Tag.3 -0.7300410 13.54155 0.8300307 0.36226364 0.8276702
+Tag.8 -0.7917906 12.86353 0.7830377 0.37621371 0.8276702
+>
+> fit <- glmFit(d,design,dispersion=d$common.dispersion,prior.count=0.5)
+> summary(fit$coef)
+ (Intercept) group2
+ Min. :-7.604 Min. :-1.13681
+ 1st Qu.:-4.895 1st Qu.:-0.32341
+ Median :-4.713 Median : 0.15083
+ Mean :-4.940 Mean : 0.07817
+ 3rd Qu.:-4.524 3rd Qu.: 0.35163
+ Max. :-4.107 Max. : 1.60864
+>
+> fit <- glmFit(d,design,prior.count=0.5/4)
+> lrt <- glmLRT(fit,coef=2)
+> topTags(lrt)
+Coefficient: group2
+ logFC logCPM LR PValue FDR
+Tag.17 2.0450964 13.73726 6.0485417 0.01391779 0.3058698
+Tag.2 4.0861092 11.54121 4.8400340 0.02780635 0.3058698
+Tag.21 -1.7265870 13.38327 4.0777825 0.04345065 0.3186381
+Tag.6 -1.6329986 12.81479 3.0078205 0.08286364 0.4557500
+Tag.16 0.9324996 13.57074 1.3477682 0.24566867 0.8276702
+Tag.20 0.8543138 13.76364 1.1890032 0.27553071 0.8276702
+Tag.19 -0.7976602 13.31405 0.9279151 0.33540526 0.8276702
+Tag.12 0.7081170 14.31389 0.9095513 0.34023349 0.8276702
+Tag.3 -0.7300410 13.54155 0.8300307 0.36226364 0.8276702
+Tag.8 -0.7917906 12.86353 0.7830377 0.37621371 0.8276702
+>
+> dglm <- estimateGLMCommonDisp(d,design)
+> dglm$common.dispersion
+[1] 0.2033282
+> dglm <- estimateGLMTagwiseDisp(dglm,design,prior.df=20)
+> summary(dglm$tagwise.dispersion)
+ Min. 1st Qu. Median Mean 3rd Qu. Max.
+ 0.1756 0.1879 0.1998 0.2031 0.2135 0.2578
+> fit <- glmFit(dglm,design,prior.count=0.5/4)
+> lrt <- glmLRT(fit,coef=2)
+> topTags(lrt)
+Coefficient: group2
+ logFC logCPM LR PValue FDR
+Tag.17 2.0450988 13.73726 6.8001118 0.009115216 0.2005348
+Tag.2 4.0861092 11.54121 4.8594088 0.027495756 0.2872068
+Tag.21 -1.7265904 13.38327 4.2537154 0.039164558 0.2872068
+Tag.6 -1.6329904 12.81479 3.1763761 0.074710253 0.4109064
+Tag.16 0.9324970 13.57074 1.4126709 0.234613512 0.8499599
+Tag.20 0.8543183 13.76364 1.2721097 0.259371274 0.8499599
+Tag.19 -0.7976614 13.31405 0.9190392 0.337727381 0.8499599
+Tag.12 0.7081163 14.31389 0.9014515 0.342392806 0.8499599
+Tag.3 -0.7300488 13.54155 0.8817937 0.347710872 0.8499599
+Tag.8 -0.7918166 12.86353 0.7356185 0.391068049 0.8603497
+> dglm <- estimateGLMTrendedDisp(dglm,design)
+> summary(dglm$trended.dispersion)
+ Min. 1st Qu. Median Mean 3rd Qu. Max.
+ 0.1522 0.1676 0.1740 0.1887 0.2000 0.3469
+> dglm <- estimateGLMTrendedDisp(dglm,design,method="power")
+> summary(dglm$trended.dispersion)
+ Min. 1st Qu. Median Mean 3rd Qu. Max.
+ 0.1522 0.1676 0.1740 0.1887 0.2000 0.3469
> dglm <- estimateGLMTrendedDisp(dglm,design,method="spline")
Loading required package: splines
> summary(dglm$trended.dispersion)
@@ -195,6 +296,20 @@ Loading required package: splines
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.1385 0.1792 0.1964 0.1935 0.2026 0.2709
>
+> dglm2 <- estimateDisp(dglm, design)
+> summary(dglm2$tagwise.dispersion)
+ Min. 1st Qu. Median Mean 3rd Qu. Max.
+ 0.1766 0.1789 0.1814 0.1846 0.1870 0.2119
+> dglm2 <- estimateDisp(dglm, design, prior.df=20)
+> summary(dglm2$tagwise.dispersion)
+ Min. 1st Qu. Median Mean 3rd Qu. Max.
+ 0.1527 0.1669 0.1814 0.1858 0.1951 0.2497
+> dglm2 <- estimateDisp(dglm, design, robust=TRUE)
+Loading required package: statmod
+> summary(dglm2$tagwise.dispersion)
+ Min. 1st Qu. Median Mean 3rd Qu. Max.
+ 0.1766 0.1789 0.1814 0.1846 0.1870 0.2119
+>
> # Continuous trend
> nlibs <- 3
> ntags <- 1000
@@ -272,6 +387,15 @@ Gene4 2 4 6
Gene5 1 1 9
995 more rows ...
+$unshrunk.coefficients
+ (Intercept) x
+Gene1 -7.437763 2.0412762
+Gene2 -7.373370 -0.8796273
+Gene3 -6.870127 -0.1465014
+Gene4 -7.552642 0.5410832
+Gene5 -8.972372 1.3929679
+995 more rows ...
+
$df.residual
[1] 1 1 1 1 1
995 more elements ...
@@ -296,6 +420,9 @@ $offset
$dispersion
[1] 0.1
+$prior.count
+[1] 0.1666667
+
$samples
group lib.size norm.factors
x0 1 4001 1.0008730
@@ -307,6 +434,19 @@ $AveLogCPM
995 more elements ...
>
+> d2 <- estimateDisp(d, design)
+> summary(d2$tagwise.dispersion)
+ Min. 1st Qu. Median Mean 3rd Qu. Max.
+0.05545 0.09511 0.11620 0.11010 0.13330 0.16860
+> d2 <- estimateDisp(d, design, prior.df=20)
+> summary(d2$tagwise.dispersion)
+ Min. 1st Qu. Median Mean 3rd Qu. Max.
+0.04203 0.08587 0.11280 0.11010 0.12370 0.37410
+> d2 <- estimateDisp(d, design, robust=TRUE)
+> summary(d2$tagwise.dispersion)
+ Min. 1st Qu. Median Mean 3rd Qu. Max.
+0.05545 0.09511 0.11620 0.11010 0.13330 0.16860
+>
> # Exact tests
> y <- matrix(rnbinom(20,mu=10,size=3/2),5,4)
> group <- factor(c(1,1,2,2))
@@ -421,53 +561,4 @@ $n0
>
> proc.time()
user system elapsed
- 3.41 0.03 3.44
- edgeR/vignettes/ 0000755 0001263 0001264 00000000000 12373265231 015177 5 [...]
-%\VignetteKeyword{RNA-Seq}
-%\VignetteKeyword{differential expression}
-%\VignettePackage{edgeR}
-\documentclass[12pt]{article}
-
-\textwidth=6.2in
-\textheight=8.5in
-\oddsidemargin=0.2in
-\evensidemargin=0.2in
-\headheight=0in
-\headsep=0in
-
-\begin{document}
-
-\title{edgeR Package Introduction}
-\author{Mark Robinson, Davis McCarthy, Yunshun Chen,\\
-Aaron Lun, Gordon K.\ Smyth}
-\date{10 October 2012\\
-Revised 10 November 2013}
-\maketitle
-
-
-edgeR is a package for the differential expression analysis of digital gene expression data,
-that is, of count data arising from DNA sequencing technologies.
-It is especially designed for differential expression analyses of RNA-Seq or SAGE data,
-or differential marking analyses of ChIP-Seq data.
-
-edgeR implements novel statistical methods based on the negative binomial distribution
-as a model for count variability, including empirical Bayes methods, exact tests, and generalized linear models.
-The package is especially suitable for analysing designed experiments with multiple
-experimental factors but possibly small numbers of replicates.
-It has unique abilities to model transcript specific variation even in small samples,
-a capability essential for prioritizing genes or transcripts that have consistent effects across replicates.
-
-The full edgeR User's Guide is available as part of the online documentation.
-To reach the User's Guide, install the edgeR package and load it into an R session by \texttt{library(edgeR)}.
-In R for Windows, the User's Guide will then be available from the drop-down menu called ``Vignettes''.
-In other operating systems, type
-\begin{Schunk}
-\begin{Sinput}
-> library(edgeR)
-> edgeRUsersGuide()
-\end{Sinput}
-\end{Schunk}
-at the R prompt to open the User's Guide in a pdf viewer.
-
-\end{document}
- [...]
\ No newline at end of file
+ 3.58 0.04 5.32
diff --git a/inst/doc/edgeR.Rnw b/vignettes/edgeR.Rnw
similarity index 93%
copy from inst/doc/edgeR.Rnw
copy to vignettes/edgeR.Rnw
index 34e205e..af5781e 100755
--- a/inst/doc/edgeR.Rnw
+++ b/vignettes/edgeR.Rnw
@@ -14,10 +14,10 @@
\begin{document}
\title{edgeR Package Introduction}
-\author{Mark Robinson, Davis McCarthy, Yunshun Chen,\\
-Aaron Lun, Gordon K.\ Smyth}
+\author{Yunshun Chen, Davis McCarthy, Aaron Lun,\\
+Xiaobei Zhou, Mark Robinson, Gordon K.\ Smyth}
\date{10 October 2012\\
-Revised 10 November 2013}
+Revised 8 October 2014}
\maketitle
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/r-bioc-edger.git
More information about the debian-med-commit
mailing list