[med-svn] [r-bioc-annotationfilter] 01/02: New upstream version 1.0.0
Andreas Tille
tille at debian.org
Fri Oct 13 12:39:24 UTC 2017
This is an automated email from the git hooks/post-receive script.
tille pushed a commit to branch master
in repository r-bioc-annotationfilter.
commit d1fc0566bc79f995decf6e3b369bbffc71906f87
Author: Andreas Tille <tille at debian.org>
Date: Fri Oct 13 14:36:27 2017 +0200
New upstream version 1.0.0
---
DESCRIPTION | 32 ++
NAMESPACE | 73 ++++
NEWS | 7 +
NOTES.md | 8 +
R/AllGenerics.R | 9 +
R/AnnotationFilter.R | 426 +++++++++++++++++++++
R/AnnotationFilterList.R | 172 +++++++++
R/translate-utils.R | 120 ++++++
README | 26 ++
build/vignette.rds | Bin 0 -> 272 bytes
inst/doc/AnnotationFilter.R | 159 ++++++++
inst/doc/AnnotationFilter.Rmd | 404 ++++++++++++++++++++
inst/doc/AnnotationFilter.html | 571 +++++++++++++++++++++++++++++
man/AnnotationFilter.Rd | 230 ++++++++++++
man/AnnotationFilterList.Rd | 82 +++++
tests/testthat.R | 4 +
tests/testthat/test_AnnotationFilter.R | 75 ++++
tests/testthat/test_AnnotationFilterList.R | 52 +++
tests/testthat/test_translate-utils.R | 108 ++++++
vignettes/AnnotationFilter.Rmd | 404 ++++++++++++++++++++
20 files changed, 2962 insertions(+)
diff --git a/DESCRIPTION b/DESCRIPTION
new file mode 100644
index 0000000..e24bcf3
--- /dev/null
+++ b/DESCRIPTION
@@ -0,0 +1,32 @@
+Package: AnnotationFilter
+Title: Facilities for Filtering Bioconductor Annotation Resources
+Version: 1.0.0
+Authors@R: c( person("Martin", "Morgan", email =
+ "martin.morgan at roswellpark.org", role = "aut"),
+ person("Johannes", "Rainer", email =
+ "johannes.rainer at eurac.edu", role = "aut"),
+ person("Bioconductor", "Maintainer",
+ email="maintainer at bioconductor.org", role = "cre"))
+URL: https://github.com/Bioconductor/AnnotationFilter
+BugReports: https://github.com/Bioconductor/AnnotationFilter/issues
+Description: This package provides class and other infrastructure to
+ implement filters for manipulating Bioconductor annotation
+ resources. The filters will be used by ensembldb,
+ Organism.dplyr, and other packages.
+Depends: R (>= 3.4.0)
+Imports: utils, methods, GenomicRanges, lazyeval
+Suggests: BiocStyle, knitr, testthat, RSQLite, org.Hs.eg.db
+VignetteBuilder: knitr
+License: Artistic-2.0
+biocViews: Annotation, Infrastructure, Software
+Encoding: UTF-8
+LazyData: true
+RoxygenNote: 6.0.1
+Collate: 'AllGenerics.R' 'AnnotationFilter.R' 'AnnotationFilterList.R'
+ 'translate-utils.R'
+NeedsCompilation: no
+Packaged: 2017-04-25 01:05:17 UTC; biocbuild
+Author: Martin Morgan [aut],
+ Johannes Rainer [aut],
+ Bioconductor Maintainer [cre]
+Maintainer: Bioconductor Maintainer <maintainer at bioconductor.org>
diff --git a/NAMESPACE b/NAMESPACE
new file mode 100644
index 0000000..c17b25f
--- /dev/null
+++ b/NAMESPACE
@@ -0,0 +1,73 @@
+# Generated by roxygen2: do not edit by hand
+
+export(AnnotationFilter)
+export(AnnotationFilterList)
+export(CdsEndFilter)
+export(CdsStartFilter)
+export(EntrezFilter)
+export(ExonEndFilter)
+export(ExonIdFilter)
+export(ExonNameFilter)
+export(ExonRankFilter)
+export(ExonStartFilter)
+export(GRangesFilter)
+export(GeneBiotypeFilter)
+export(GeneEndFilter)
+export(GeneIdFilter)
+export(GeneStartFilter)
+export(GenenameFilter)
+export(ProteinIdFilter)
+export(SeqNameFilter)
+export(SeqStrandFilter)
+export(SymbolFilter)
+export(TxBiotypeFilter)
+export(TxEndFilter)
+export(TxIdFilter)
+export(TxNameFilter)
+export(TxStartFilter)
+export(UniprotFilter)
+export(feature)
+exportClasses(AnnotationFilter)
+exportClasses(AnnotationFilterList)
+exportClasses(CdsEndFilter)
+exportClasses(CdsStartFilter)
+exportClasses(CharacterFilter)
+exportClasses(EntrezFilter)
+exportClasses(ExonEndFilter)
+exportClasses(ExonIdFilter)
+exportClasses(ExonNameFilter)
+exportClasses(ExonRankFilter)
+exportClasses(ExonStartFilter)
+exportClasses(GRangesFilter)
+exportClasses(GeneBiotypeFilter)
+exportClasses(GeneEndFilter)
+exportClasses(GeneIdFilter)
+exportClasses(GeneStartFilter)
+exportClasses(GenenameFilter)
+exportClasses(IntegerFilter)
+exportClasses(ProteinIdFilter)
+exportClasses(SeqNameFilter)
+exportClasses(SeqStrandFilter)
+exportClasses(SymbolFilter)
+exportClasses(TxBiotypeFilter)
+exportClasses(TxEndFilter)
+exportClasses(TxIdFilter)
+exportClasses(TxNameFilter)
+exportClasses(TxStartFilter)
+exportClasses(UniprotFilter)
+exportMethods(condition)
+exportMethods(field)
+exportMethods(show)
+exportMethods(supportedFilters)
+exportMethods(value)
+importClassesFrom(GenomicRanges,GRanges)
+importFrom(GenomicRanges,GRanges)
+importFrom(GenomicRanges,show)
+importFrom(lazyeval,f_eval)
+importFrom(methods,callNextMethod)
+importFrom(methods,initialize)
+importFrom(methods,is)
+importFrom(methods,new)
+importFrom(methods,show)
+importFrom(methods,validObject)
+importFrom(utils,tail)
diff --git a/NEWS b/NEWS
new file mode 100644
index 0000000..43d5934
--- /dev/null
+++ b/NEWS
@@ -0,0 +1,7 @@
+CHANGES IN VERSION 0.99.5
+--------------------------
+
+NEW FEATURES
+
+ o Add convertFilterExpressionQuoted function.
+ o Add field method.
diff --git a/NOTES.md b/NOTES.md
new file mode 100644
index 0000000..f68e074
--- /dev/null
+++ b/NOTES.md
@@ -0,0 +1,8 @@
+# Development guidelines
+
+- roxygen2 documentation
+- testthat unit tests
+- file name correspondence between code `R/foo.R`, tests
+ `tests/testthat/test_foo.R`, and documentation `man/foo.Rd`.
+- version bump on master commit
+- commits to master pass R CMD build && R CMD check
diff --git a/R/AllGenerics.R b/R/AllGenerics.R
new file mode 100644
index 0000000..71be13d
--- /dev/null
+++ b/R/AllGenerics.R
@@ -0,0 +1,9 @@
+## Generic methods.
+setGeneric("condition", function(object, ...) standardGeneric("condition"))
+
+setGeneric("field", function(object, ...) standardGeneric("field"))
+
+setGeneric("value", function(object, ...) standardGeneric("value"))
+
+setGeneric("supportedFilters", function(object, ...)
+ standardGeneric("supportedFilters"))
diff --git a/R/AnnotationFilter.R b/R/AnnotationFilter.R
new file mode 100644
index 0000000..a5390ca
--- /dev/null
+++ b/R/AnnotationFilter.R
@@ -0,0 +1,426 @@
+#' @name AnnotationFilter
+#'
+#' @title Filters for annotation objects
+#'
+#' @aliases CdsStartFilter CdsEndFilter ExonIdFilter ExonNameFilter
+#' ExonStartFilter ExonEndFilter ExonRankFilter GeneIdFilter
+#' GenenameFilter GeneBiotypeFilter GeneStartFilter GeneEndFilter
+#' EntrezFilter SymbolFilter TxIdFilter TxNameFilter
+#' TxBiotypeFilter TxStartFilter TxEndFilter ProteinIdFilter
+#' UniprotFilter SeqNameFilter SeqStrandFilter
+#' AnnotationFilter-class CharacterFilter-class
+#' IntegerFilter-class CdsStartFilter-class CdsEndFilter-class
+#' ExonIdFilter-class ExonNameFilter-class ExonStartFilter-class
+#' ExonEndFilter-class ExonRankFilter-class GeneIdFilter-class
+#' GenenameFilter-class GeneBiotypeFilter-class
+#' GeneStartFilter-class GeneEndFilter-class EntrezFilter-class
+#' SymbolFilter-class TxIdFilter-class TxNameFilter-class
+#' TxBiotypeFilter-class TxStartFilter-class TxEndFilter-class
+#' ProteinIdFilter-class UniprotFilter-class SeqNameFilter-class
+#' SeqStrandFilter-class supportedFilters
+#' show,AnnotationFilter-method show,CharacterFilter-method
+#' show,IntegerFilter-method show,GRangesFilter-method
+#'
+#' @description
+#'
+#' The filters extending the base \code{AnnotationFilter} class
+#' represent a simple filtering concept for annotation resources.
+#' Each filter object is thought to filter on a single (database)
+#' table column using the provided values and the defined condition.
+#'
+#' Filter instances are created using the constructor functions (e.g.
+#' \code{GeneIdFilter}).
+#'
+#' \code{supportedFilters()} lists all defined filters. Packages using
+#' \code{AnnotationFilter} should implement the \code{supportedFilters} for
+#' their annotation resource object (e.g. for \code{object = "EnsDb"} in the
+#' \code{ensembldb} package) to list all supported filters for the specific
+#' resource.
+#'
+#' @details
+#'
+#' By default filters are only available for tables containing the
+#' field on which the filter acts (i.e. that contain a column with the
+#' name matching the value of the \code{field} slot of the
+#' object). See the vignette for a description to use filters for
+#' databases in which the database table column name differs from the
+#' default \code{field} of the filter.
+#'
+#' @usage
+#'
+#' CdsStartFilter(value, condition = "==")
+#' CdsEndFilter(value, condition = "==")
+#' ExonIdFilter(value, condition = "==")
+#' ExonNameFilter(value, condition = "==")
+#' ExonRankFilter(value, condition = "==")
+#' ExonStartFilter(value, condition = "==")
+#' ExonEndFilter(value, condition = "==")
+#' GeneIdFilter(value, condition = "==")
+#' GenenameFilter(value, condition = "==")
+#' GeneBiotypeFilter(value, condition = "==")
+#' GeneStartFilter(value, condition = "==")
+#' GeneEndFilter(value, condition = "==")
+#' EntrezFilter(value, condition = "==")
+#' SymbolFilter(value, condition = "==")
+#' TxIdFilter(value, condition = "==")
+#' TxNameFilter(value, condition = "==")
+#' TxBiotypeFilter(value, condition = "==")
+#' TxStartFilter(value, condition = "==")
+#' TxEndFilter(value, condition = "==")
+#' ProteinIdFilter(value, condition = "==")
+#' UniprotFilter(value, condition = "==")
+#' SeqNameFilter(value, condition = "==")
+#' SeqStrandFilter(value, condition = "==")
+#'
+#' @param value \code{character()}, \code{integer()}, or
+#' \code{GRanges()} value for the filter
+#'
+#' @param condition \code{character(1)} defining the condition to be
+#' used in the filter. For \code{IntegerFilter}, one of
+#' \code{"=="}, \code{"!="}, \code{">"}, \code{"<"}, \code{">="}
+#' or \code{"<="}. For \code{CharacterFilter}, one of \code{"=="},
+#' \code{"!="}, \code{"startsWith"} or \code{"endsWith"}. Default
+#' condition is \code{"=="}.
+#'
+#' @return The constructor functions return an object extending
+#' \code{AnnotationFilter}. For the return value of the other methods see
+#' the methods' descriptions.
+#'
+#' @seealso \code{\link{AnnotationFilterList}} for combining
+#' \code{AnnotationFilter} objects.
+NULL
+
+.CONDITION <- list(
+ IntegerFilter = c("==", "!=", ">", "<", ">=", "<="),
+ CharacterFilter = c("==", "!=", "startsWith", "endsWith"),
+ GRangesFilter = c("any", "start", "end", "within", "equal")
+)
+
+.FIELD <- list(
+ CharacterFilter = c(
+ "exon_id", "exon_name", "gene_id", "genename", "gene_biotype",
+ "entrez", "symbol", "tx_id", "tx_name", "tx_biotype",
+ "protein_id", "uniprot", "seq_name", "seq_strand"),
+ IntegerFilter = c(
+ "cds_start", "cds_end", "exon_start", "exon_rank", "exon_end",
+ "gene_start", "gene_end", "tx_start", "tx_end")
+)
+
+.valid_condition <- function(condition, class) {
+ txt <- character()
+
+ test0 <- length(condition) == 1L
+ if (!test0)
+ txt <- c(txt, "'condition' must be length 1")
+
+ test1 <- test0 && (condition %in% .CONDITION[[class]])
+ if (!test1) {
+ value <- paste(sQuote(.CONDITION[[class]]), collapse=" ")
+ txt <- c(txt, paste0("'", condition, "' must be in ", value))
+ }
+
+ if (length(txt)) txt else TRUE
+}
+
+############################################################
+## AnnotationFilter
+##
+
+#' @exportClass AnnotationFilter
+.AnnotationFilter <- setClass(
+ "AnnotationFilter",
+ contains = "VIRTUAL",
+ slots = c(
+ field="character",
+ condition="character",
+ value="ANY"
+ ),
+ prototype=list(
+ condition= "=="
+ )
+)
+
+setValidity("AnnotationFilter", function(object) {
+ txt <- character()
+
+ value <- .value(object)
+ condition <- .condition(object)
+ test_len <- length(condition) == 1L
+ test_NA <- !any(is.na(condition))
+
+ if (test_len && !test_NA)
+ txt <- c(txt, "'condition' can not be NA")
+ test0 <- test_len && test_NA
+
+ test1 <- condition %in% c("startsWith", "endsWith", ">", "<", ">=", "<=")
+ if (test0 && test1 && length(value) > 1L)
+ txt <- c(txt, paste0("'", condition, "' requires length 1 'value'"))
+
+ if (any(is.na(value)))
+ txt <- c(txt, "'value' can not be NA")
+
+ if (length(txt)) txt else TRUE
+})
+
+.field <- function(object) object@field          # read the 'field' slot
+
+.condition <- function(object) object@condition  # read the 'condition' slot
+
+.value <- function(object) object@value          # read the 'value' slot
+
+#' @rdname AnnotationFilter
+#'
+#' @aliases condition
+#'
+#' @description \code{condition()} get the \code{condition} value for
+#' the filter \code{object}.
+#'
+#' @param object An \code{AnnotationFilter} object.
+#'
+#' @export
+setMethod("condition", "AnnotationFilter", .condition)
+
+#' @rdname AnnotationFilter
+#'
+#' @aliases value
+#'
+#' @description \code{value()} get the \code{value} for the filter
+#' \code{object}.
+#'
+#' @export
+setMethod("value", "AnnotationFilter", .value)
+
+#' @rdname AnnotationFilter
+#'
+#' @aliases field
+#'
+#' @description \code{field()} get the \code{field} for the filter
+#' \code{object}.
+#'
+#' @export
+setMethod("field", "AnnotationFilter", .field)
+
+#' @importFrom methods show
+#'
+#' @export
+setMethod("show", "AnnotationFilter", function(object){
+ cat("class:", class(object),
+ "\ncondition:", .condition(object), "\n")
+})
+
+############################################################
+## CharacterFilter, IntegerFilter
+##
+
+#' @exportClass CharacterFilter
+.CharacterFilter <- setClass(
+ "CharacterFilter",
+ contains = c("VIRTUAL", "AnnotationFilter"),
+ slots = c(value = "character"),
+ prototype = list(
+ value = character()
+ )
+)
+
+setValidity("CharacterFilter", function(object) {
+ .valid_condition(.condition(object), "CharacterFilter")
+})
+
+#' @importFrom methods show callNextMethod
+#'
+#' @export
+setMethod("show", "CharacterFilter", function(object) {
+ callNextMethod()
+ cat("value:", .value(object), "\n")
+})
+
+#' @exportClass IntegerFilter
+.IntegerFilter <- setClass(
+ "IntegerFilter",
+ contains = c("VIRTUAL", "AnnotationFilter"),
+ slots = c(value = "integer"),
+ prototype = list(
+ value = integer()
+ )
+)
+
+setValidity("IntegerFilter", function(object) {
+ .valid_condition(.condition(object), "IntegerFilter")
+})
+
+#' @export
+setMethod("show", "IntegerFilter", function(object) {
+ callNextMethod()
+ cat("value:", .value(object), "\n")
+})
+
+#' @rdname AnnotationFilter
+#'
+#' @importFrom GenomicRanges GRanges
+#'
+#' @importClassesFrom GenomicRanges GRanges
+#'
+#' @exportClass GRangesFilter
+.GRangesFilter <- setClass(
+ "GRangesFilter",
+ contains = "AnnotationFilter",
+ slots = c(
+ value = "GRanges",
+ feature = "character"
+ ),
+ prototype = list(
+ value = GRanges(),
+ condition = "any",
+ field = "granges",
+ feature = "gene"
+ )
+)
+
+setValidity("GRangesFilter", function(object) {
+ .valid_condition(.condition(object), "GRangesFilter")
+})
+
+.feature <- function(object) object@feature    # read the 'feature' slot
+
+#' @rdname AnnotationFilter
+#'
+#' @param type \code{character(1)} indicating how overlaps are to be
+#' filtered. See \code{findOverlaps} in the IRanges package for a
+#' description of this argument.
+#'
+#' @examples
+#' ## filter by GRanges
+#' GRangesFilter(GenomicRanges::GRanges("chr10:87869000-87876000"))
+#' @export
+GRangesFilter <-
+ function(value, feature = "gene",
+ type = c("any", "start", "end", "within", "equal"))
+{
+ condition <- match.arg(type)
+ .GRangesFilter(
+ field = "granges",
+ value = value,
+ condition = condition,
+ feature = feature)
+}
+
+.feature <- function(object) object@feature    # read the 'feature' slot
+
+#' @aliases feature
+#'
+#' @description \code{feature()} get the \code{feature} for the
+#' \code{GRangesFilter} \code{object}.
+#'
+#' @rdname AnnotationFilter
+#'
+#' @export
+feature <- .feature
+
+#' @importFrom GenomicRanges show
+#'
+#' @export
+setMethod("show", "GRangesFilter", function(object) {
+ callNextMethod()
+ cat("feature:", .feature(object),
+ "\nvalue:\n")
+ show(value(object))
+})
+
+
+############################################################
+## Create install-time classes
+##
+
+#' @rdname AnnotationFilter
+#'
+#' @name AnnotationFilter
+#'
+#' @param feature \code{character(1)} defining on what feature the
+#' \code{GRangesFilter} should be applied. Choices could be
+#' \code{"gene"}, \code{"tx"} or \code{"exon"}.
+#'
+#' @examples
+#' ## Create a SymbolFilter to filter on a gene's symbol.
+#' sf <- SymbolFilter("BCL2")
+#' sf
+#'
+#' ## Create a GeneStartFilter to filter based on the genes' chromosomal start
+#' ## coordinates
+#' gsf <- GeneStartFilter(10000, condition = ">")
+#' gsf
+#'
+#' @export CdsStartFilter CdsEndFilter ExonIdFilter ExonNameFilter
+#' @export ExonStartFilter ExonEndFilter ExonRankFilter GeneIdFilter
+#' @export GenenameFilter GeneBiotypeFilter GeneStartFilter
+#' @export GeneEndFilter EntrezFilter SymbolFilter TxIdFilter
+#' @export TxNameFilter TxBiotypeFilter TxStartFilter TxEndFilter
+#' @export ProteinIdFilter UniprotFilter SeqNameFilter SeqStrandFilter
+#'
+#' @importFrom methods new
+#'
+#' @exportClass CdsStartFilter CdsEndFilter ExonIdFilter
+#' ExonNameFilter ExonStartFilter ExonEndFilter ExonRankFilter
+#' GeneIdFilter GenenameFilter GeneBiotypeFilter GeneStartFilter
+#' GeneEndFilter EntrezFilter SymbolFilter TxIdFilter TxNameFilter
+#' TxBiotypeFilter TxStartFilter TxEndFilter ProteinIdFilter
+#' UniprotFilter SeqNameFilter SeqStrandFilter
+NULL
+
+.fieldToClass <- function(field) {
+ class <- gsub("_([[:alpha:]])", "\\U\\1", field, perl=TRUE)
+ class <- sub("^([[:alpha:]])", "\\U\\1", class, perl=TRUE)
+ paste0(class, if (length(class)) "Filter" else character(0))
+}
+
+.filterFactory <- function(field, class) {
+ force(field); force(class) # watch for lazy evaluation
+ as.value <-
+ if (field %in% .FIELD[["CharacterFilter"]]) {
+ as.character
+ } else {
+ function(x) {
+ stopifnot(is.numeric(x))
+ as.integer(x)
+ }
+ }
+
+ function(value, condition = "==") {
+ value <- as.value(value)
+ condition <- as.character(condition)
+ new(class, field=field, condition = condition, value=value)
+ }
+}
+
+local({
+ makeClass <- function(contains) {
+ fields <- .FIELD[[contains]]
+ classes <- .fieldToClass(fields)
+ for (i in seq_along(fields)) {
+ setClass(classes[[i]], contains=contains, where=topenv())
+ assign(
+ classes[[i]],
+ .filterFactory(fields[[i]], classes[[i]]),
+ envir=topenv()
+ )
+ }
+ }
+ for (contains in names(.FIELD))
+ makeClass(contains)
+})
+
+############################################################
+## Utilities - supportedFilters
+##
+
+.supportedFilters <- function() {
+ sort(c(.fieldToClass(unlist(.FIELD, use.names=FALSE)), "GRangesFilter"))
+}
+
+#' @rdname AnnotationFilter
+#'
+#' @examples
+#' supportedFilters()
+#' @export
+setMethod("supportedFilters", "missing", function(object) {
+ .supportedFilters()
+})
diff --git a/R/AnnotationFilterList.R b/R/AnnotationFilterList.R
new file mode 100644
index 0000000..734c93f
--- /dev/null
+++ b/R/AnnotationFilterList.R
@@ -0,0 +1,172 @@
+#' @include AnnotationFilter.R
+
+#' @rdname AnnotationFilterList
+#'
+#' @name AnnotationFilterList
+#'
+#' @title Combining annotation filters
+#'
+#' @aliases AnnotationFilterList-class
+#'
+#' @description The \code{AnnotationFilterList} allows to combine
+#' filter objects extending the \code{\link{AnnotationFilter}}
+#' class to construct more complex queries. Consecutive filter
+#' objects in the \code{AnnotationFilterList} can be combined by a
+#' logical \emph{and} (\code{&}) or \emph{or} (\code{|}). The
+#' \code{AnnotationFilterList} extends \code{list}, individual
+#' elements can thus be accessed with \code{[[}.
+#'
+#' @note The \code{AnnotationFilterList} does not support containing empty
+#' elements, hence all elements of \code{length == 0} are removed in
+#' the constructor function.
+#'
+#' @exportClass AnnotationFilterList
+NULL
+
+.AnnotationFilterList <- setClass(
+ "AnnotationFilterList",
+ contains = "list",
+ slots = c(logOp = "character")
+)
+
+.LOG_OPS <- c("&", "|")
+
+setValidity("AnnotationFilterList",
+ function(object)
+{
+ txt <- character()
+ filters <- .aflvalue(object)
+ logOp <- .logOp(object)
+ if (length(filters) == 0 && length(logOp)) {
+ txt <- c(
+ txt, "'logOp' can not have length > 0 if the object is empty"
+ )
+ } else if (length(filters) != 0) {
+ ## Note: we allow length of filters being 1, but then logOp has
+ ## to be empty. Check content:
+ fun <- function(z)
+ is(z, "AnnotationFilter") || is(z, "AnnotationFilterList")
+ test <- vapply(filters, fun, logical(1))
+ if (!all(test)){
+ txt <- c(
+ txt, "only 'AnnotationFilter' or 'AnnotationFilterList' allowed"
+ )
+ }
+ ## Check that all elements are non-empty (issue #17). Doing this
+ ## separately from the check above to ensure we get a different error
+ ## message.
+ if (!all(lengths(filters) > 0))
+ txt <- c(txt, "Lengths of all elements have to be > 0")
+ ## Check that logOp has length object -1
+ if (length(logOp) != length(filters) - 1)
+ txt <- c(txt, "length of 'logOp' has to be length of the object -1")
+ ## Check content of logOp.
+ if (!all(logOp %in% .LOG_OPS))
+ txt <- c(txt, "'logOp' can only contain '&' and '|'")
+ }
+
+ if (length(txt)) txt else TRUE
+})
+
+## AnnotationFilterList constructor function.
+#' @rdname AnnotationFilterList
+#'
+#' @name AnnotationFilterList
+#'
+#' @param ... individual \code{\link{AnnotationFilter}} objects or a
+#' mixture of \code{AnnotationFilter} and
+#' \code{AnnotationFilterList} objects.
+#'
+#' @param logOp \code{character} of length being equal to the number
+#' of submitted \code{AnnotationFilter} objects -1. Each value
+#' representing the logical operation to combine consecutive
+#' filters, i.e. the first element being the logical operation to
+#' combine the first and second \code{AnnotationFilter}, the
+#' second element being the logical operation to combine the
+#' second and third \code{AnnotationFilter} and so on. Allowed
+#' values are \code{"&"} and \code{"|"}. The function assumes a
+#' logical \emph{and} between all elements by default.
+#'
+#' @seealso \code{\link{supportedFilters}} for available
+#' \code{\link{AnnotationFilter}} objects
+#'
+#' @return \code{AnnotationFilterList} returns an \code{AnnotationFilterList}.
+#'
+#' @examples
+#' ## Create some AnnotationFilters
+#' gf <- GenenameFilter(c("BCL2", "BCL2L11"))
+#' tbtf <- TxBiotypeFilter("protein_coding", condition = "!=")
+#'
+#' ## Combine both to an AnnotationFilterList. By default elements are combined
+#' ## using a logical "and" operator. The filter list represents thus a query
+#' ## like: get all features where the gene name is either ("BCL2" or "BCL2L11")
+#' ## and the transcript biotype is not "protein_coding".
+#' afl <- AnnotationFilterList(gf, tbtf)
+#' afl
+#'
+#' ## Access individual filters.
+#' afl[[1]]
+#'
+#' ## Create a filter in the form of: get all features where the gene name is
+#' ## either ("BCL2" or "BCL2L11") and the transcript biotype is not
+#' ## "protein_coding" or the seq_name is "Y". Hence, this will get all feature
+#' ## also found by the previous AnnotationFilterList and returns also all
+#' ## features on chromosome Y.
+#' afl <- AnnotationFilterList(gf, tbtf, SeqNameFilter("Y"),
+#' logOp = c("&", "|"))
+#' afl
+#'
+#' @export
+AnnotationFilterList <-
+    function(..., logOp = character())
+{
+    filters <- list(...)
+    ## Remove empty elements (issue #17)
+    filters <- filters[lengths(filters) > 0]
+    ## Default to "&" between elements (both tests are scalar, hence '&&').
+    if (length(filters) > 1 && length(logOp) == 0)
+        logOp <- rep("&", (length(filters) - 1))
+    .AnnotationFilterList(filters, logOp = logOp)
+}
+
+.logOp <- function(object) object@logOp      # operators joining the filters
+
+.aflvalue <- function(object) object@.Data   # underlying list of filters
+
+#' @rdname AnnotationFilterList
+#'
+#' @description \code{value()} get a \code{list} with the
+#' \code{AnnotationFilter} objects. Use \code{[[} to access
+#' individual filters.
+#'
+#' @return \code{value} returns a \code{list} with \code{AnnotationFilter}
+#' objects.
+#'
+#' @export
+setMethod("value", "AnnotationFilterList", .aflvalue)
+
+
+#' @rdname AnnotationFilterList
+#'
+#' @param object An object of class \code{AnnotationFilterList}.
+#'
+#' @importFrom utils tail
+#' @export
+setMethod("show", "AnnotationFilterList",
+ function(object)
+{
+ cat("class:", class(object),
+ "\nlength:", length(object)
+ )
+
+ if (length(object) == 0L)
+ return()
+
+ cat("\nfilters:\n\n")
+ show(object[[1L]])
+ for (i in tail(seq_along(object), -1L)) {
+ cat("\n", .logOp(object)[i - 1L], "\n\n")
+ show(object[[i]])
+ }
+})
+
diff --git a/R/translate-utils.R b/R/translate-utils.R
new file mode 100644
index 0000000..5232554
--- /dev/null
+++ b/R/translate-utils.R
@@ -0,0 +1,120 @@
+#' @include AnnotationFilter.R
+
+## Functionality to translate a query condition to an AnnotationFilter.
+
+#' Adapted from GenomicDataCommons.
+#'
+#' @importFrom methods is validObject initialize
+#'
+#' @noRd
+.binary_op <- function(sep) {
+ force(sep)
+ function(e1, e2) {
+ ## First create the class. Throws an error if not possible i.e. no
+ ## class for the field available.
+ field <- as.character(substitute(e1))
+ class <- .fieldToClass(field)
+ filter <- tryCatch({
+ new(class, condition = sep, field = field)
+ }, error = function(e) {
+ stop("No AnnotationFilter class '", class, "' for field '",
+ field, "' defined")
+ })
+ ## Fill with values.
+ force(e2)
+ if (is(filter, "CharacterFilter")) {
+ e2 <- as.character(e2)
+ } else if (is(filter, "IntegerFilter")) {
+ e2 <- as.integer(e2)
+ }
+ initialize(filter, value = e2)
+ }
+}
+
+#' Combine filters into an AnnotationFilterList combined with \code{sep}
+#'
+#' @noRd
+.combine_op <- function(sep) {
+    force(sep)
+    function(e1, e2) {
+        ## Avoid implicit nesting of AnnotationFilterList - should be done
+        ## eventually. Operators are kept in left-to-right operand order.
+        if (is(e1, "AnnotationFilterList")) {
+            sep <- c(.logOp(e1), sep)
+            e1 <- .aflvalue(e1)
+        } else
+            e1 <- list(e1)
+        if (is(e2, "AnnotationFilterList")) {
+            sep <- c(sep, .logOp(e2))   # e2's operators follow the joining one
+            e2 <- .aflvalue(e2)
+        } else
+            e2 <- list(e2)
+        ## Don't use the constructor here.
+        new("AnnotationFilterList", c(e1, e2), logOp = sep)
+    }
+}
+
+#' The \code{.LOG_OP_REG} is a \code{list} providing functions for
+#' common logical operations to translate expressions into AnnotationFilter
+#' objects.
+#'
+#' @noRd
+.LOG_OP_REG <- list()
+## Assign conditions.
+.LOG_OP_REG$`==` <- .binary_op("==")
+.LOG_OP_REG$`%in%` <- .binary_op("==")
+.LOG_OP_REG$`!=` <- .binary_op("!=")
+.LOG_OP_REG$`>` <- .binary_op(">")
+.LOG_OP_REG$`<` <- .binary_op("<")
+.LOG_OP_REG$`>=` <- .binary_op(">=")
+.LOG_OP_REG$`<=` <- .binary_op("<=")
+## combine filters
+.LOG_OP_REG$`&` <- .combine_op("&")
+.LOG_OP_REG$`|` <- .combine_op("|")
+
+#' @rdname AnnotationFilter
+#'
+#' @description \code{AnnotationFilter} \emph{translates} a filter
+#' expression such as \code{~ gene_id == "BCL2"} into a filter object
+#' extending the \code{\link{AnnotationFilter}} class (in the example a
+#' \code{\link{GeneIdFilter}} object) or an
+#' \code{\link{AnnotationFilterList}} if the expression contains multiple
+#' conditions (see examples below).
+#'
+#' @details Filter expressions for the \code{AnnotationFilter} class have to be
+#' written as formulas, i.e. starting with a \code{~}.
+#'
+#' @note Translation of nested filter expressions using the
+#' \code{AnnotationFilter} function is not yet supported.
+#'
+#' @param expr A filter expression, written as a \code{formula}, to be
+#' converted to an \code{AnnotationFilter} or \code{AnnotationFilterList}
+#' class. See below for examples.
+#'
+#' @return \code{AnnotationFilter} returns an
+#' \code{\link{AnnotationFilter}} or an \code{\link{AnnotationFilterList}}.
+#'
+#' @importFrom lazyeval f_eval
+#'
+#' @examples
+#'
+#' ## Convert a filter expression based on a gene ID to a GeneIdFilter
+#' gnf <- AnnotationFilter(~ gene_id == "BCL2")
+#' gnf
+#'
+#' ## Same conversion but for two gene IDs.
+#' gnf <- AnnotationFilter(~ gene_id %in% c("BCL2", "BCL2L11"))
+#' gnf
+#'
+#' ## Converting an expression that combines multiple filters. As a result we
+#' ## get an AnnotationFilterList containing the corresponding filters.
+#' ## Be aware that nesting of expressions/filters does not work.
+#' flt <- AnnotationFilter(~ gene_id %in% c("BCL2", "BCL2L11") &
+#' tx_biotype == "nonsense_mediated_decay" |
+#' seq_name == "Y")
+#' flt
+#'
+#' @export
+AnnotationFilter <- function(expr) {
+ f_eval(expr, data = .LOG_OP_REG)
+}
diff --git a/README b/README
new file mode 100644
index 0000000..c91610d
--- /dev/null
+++ b/README
@@ -0,0 +1,26 @@
+Package: AnnotationFilter
+Title: Facilities for Filtering Bioconductor Annotation Resources
+Version: 0.99.8
+Authors@R: c( person("Martin", "Morgan", email =
+ "martin.morgan at roswellpark.org", role = "aut"),
+ person("Johannes", "Rainer", email =
+ "johannes.rainer at eurac.edu", role = "aut"),
+ person("Bioconductor", "Maintainer",
+ email="maintainer at bioconductor.org", role = "cre"))
+URL: https://github.com/Bioconductor/AnnotationFilter
+BugReports: https://github.com/Bioconductor/AnnotationFilter/issues
+Description: This package provides class and other infrastructure to
+ implement filters for manipulating Bioconductor annotation
+ resources. The filters will be used by ensembldb,
+ Organism.dplyr, and other packages.
+Depends: R (>= 3.4.0)
+Imports: utils, methods, GenomicRanges, lazyeval
+Suggests: BiocStyle, knitr, testthat, RSQLite, org.Hs.eg.db
+VignetteBuilder: knitr
+License: Artistic-2.0
+biocViews: Annotation, Infrastructure, Software
+Encoding: UTF-8
+LazyData: true
+RoxygenNote: 6.0.1
+Collate: 'AllGenerics.R' 'AnnotationFilter.R' 'AnnotationFilterList.R'
+ 'translate-utils.R'
diff --git a/build/vignette.rds b/build/vignette.rds
new file mode 100644
index 0000000..ec7ec6d
Binary files /dev/null and b/build/vignette.rds differ
diff --git a/inst/doc/AnnotationFilter.R b/inst/doc/AnnotationFilter.R
new file mode 100644
index 0000000..18b925e
--- /dev/null
+++ b/inst/doc/AnnotationFilter.R
@@ -0,0 +1,159 @@
+## ----style, echo = FALSE, results = 'asis', message=FALSE------------------
+BiocStyle::markdown()
+
+## ----supportedFilters------------------------------------------------------
+library(AnnotationFilter)
+supportedFilters()
+
+## ----symbol-filter---------------------------------------------------------
+library(AnnotationFilter)
+
+smbl <- SymbolFilter("BCL2")
+smbl
+
+## ----symbol-startsWith-----------------------------------------------------
+smbl <- SymbolFilter("BCL2", condition = "startsWith")
+smbl
+
+## ----convert-expression----------------------------------------------------
+smbl <- AnnotationFilter(~ symbol == "BCL2")
+smbl
+
+## ----convert-multi-expression----------------------------------------------
+flt <- AnnotationFilter(~ symbol == "BCL2" &
+ tx_biotype == "protein_coding")
+flt
+
+## ----nested-query----------------------------------------------------------
+## Define the filter query for the first pair of filters.
+afl1 <- AnnotationFilterList(SymbolFilter("BCL2L11"),
+ TxBiotypeFilter("nonsense_mediated_decay"))
+## Define the second filter pair (the filters within each pair of brackets should be combined).
+afl2 <- AnnotationFilterList(SymbolFilter("BCL2"),
+ TxBiotypeFilter("protein_coding"))
+## Now combine both with a logical OR
+afl <- AnnotationFilterList(afl1, afl2, logOp = "|")
+
+afl
+
+## ----define-data.frame-----------------------------------------------------
+## Define a simple gene table
+gene <- data.frame(gene_id = 1:10,
+ symbol = c(letters[1:9], "b"),
+ seq_name = paste0("chr", c(1, 4, 4, 8, 1, 2, 5, 3, "X", 4)),
+ stringsAsFactors = FALSE)
+gene
+
+## ----simple-symbol---------------------------------------------------------
+smbl <- SymbolFilter("b")
+
+## ----simple-symbol-condition-----------------------------------------------
+condition(smbl)
+
+## ----simple-symbol-value---------------------------------------------------
+value(smbl)
+
+## ----simple-symbol-field---------------------------------------------------
+field(smbl)
+
+## ----doMatch---------------------------------------------------------------
+
+doMatch <- function(x, filter) { ## `x`: data table, `filter`: filter object
+ do.call(condition(filter), list(x[, field(filter)], value(filter))) ## calls <condition>(<filter's column of x>, <filter value>)
+}
+
+## Apply this function
+doMatch(gene, smbl)
+
+
+## ----doExtract-------------------------------------------------------------
+
+doExtract <- function(x, filter) { ## `x`: data table, `filter`: passed on to doMatch()
+ x[doMatch(x, filter), ] ## keep the rows selected by the result of doMatch()
+}
+
+## Apply it on the data
+doExtract(gene, smbl)
+
+## ----doMatch-formula-------------------------------------------------------
+
+doMatch <- function(x, filter) { ## same call as the earlier doMatch(), but `filter` may also be a formula
+ if (is(filter, "formula"))
+ filter <- AnnotationFilter(filter) ## translate the filter expression with AnnotationFilter()
+ do.call(condition(filter), list(x[, field(filter)], value(filter))) ## calls <condition>(<filter's column of x>, <filter value>)
+}
+
+doExtract(gene, ~ gene_id == '2')
+
+
+## ----orgDb, message = FALSE------------------------------------------------
+## Load the required packages
+library(org.Hs.eg.db)
+library(RSQLite)
+## Get the database connection
+dbcon <- org.Hs.eg_dbconn()
+
+## What tables do we have?
+dbListTables(dbcon)
+
+## ----gene_info-------------------------------------------------------------
+## What fields are there in the gene_info table?
+dbListFields(dbcon, "gene_info")
+
+## ----doExtractSQL----------------------------------------------------------
+
+doExtractGene <- function(x, filter) { ## `x`: database connection
+ gene <- dbGetQuery(x, "select * from gene_info") ## fetch the complete gene_info table first ...
+ doExtract(gene, filter) ## ... and subset it in R afterwards
+}
+
+## Extract all entries for BCL2
+bcl2 <- doExtractGene(dbcon, SymbolFilter("BCL2"))
+
+bcl2
+
+## ----simpleSQL-------------------------------------------------------------
+
+## Define a simple function that covers some condition conversion
+conditionForSQL <- function(x) { ## `x`: the filter condition as a single character string
+ switch(x,
+ "==" = "=", ## R's "==" is written "=" in SQL
+ x) ## every other condition is returned unchanged
+}
+
+## Define a function to translate a filter into an SQL where condition.
+## Character values have to be quoted.
+where <- function(x) { ## `x`: filter object; returns <field><condition><value> pasted into one string
+ if (is(x, "CharacterFilter"))
+ value <- paste0("'", value(x), "'") ## wrap character values in single quotes
+ else value <- value(x)
+ paste0(field(x), conditionForSQL(condition(x)), value) ## NOTE(review): the value is pasted in unescaped; acceptable for illustration only
+}
+
+## Now "translate" a filter using this function
+where(SeqNameFilter("Y"))
+
+
+## ----doExtractGene2--------------------------------------------------------
+
+## Define a function that performs the filtering within the SQL query
+doExtractGene2 <- function(x, filter) {
+ if (is(filter, "formula"))
+ filter <- AnnotationFilter(filter) ## translate the filter expression with AnnotationFilter()
+ query <- paste0("select * from gene_info where ", where(filter)) ## append the translated filter as where condition
+ dbGetQuery(x, query)
+}
+
+bcl2 <- doExtractGene2(dbcon, ~ symbol == "BCL2")
+bcl2
+
+
+## ----performance-----------------------------------------------------------
+system.time(doExtractGene(dbcon, ~ symbol == "BCL2"))
+
+system.time(doExtractGene2(dbcon, ~ symbol == "BCL2"))
+
+
+## ----si--------------------------------------------------------------------
+sessionInfo()
+
diff --git a/inst/doc/AnnotationFilter.Rmd b/inst/doc/AnnotationFilter.Rmd
new file mode 100644
index 0000000..0dddd45
--- /dev/null
+++ b/inst/doc/AnnotationFilter.Rmd
@@ -0,0 +1,404 @@
+---
+title: "Facilities for Filtering Bioconductor Annotation Resources"
+output:
+ BiocStyle::html_document2:
+ toc_float: true
+vignette: >
+ %\VignetteIndexEntry{Facilities for Filtering Bioconductor Annotation resources}
+ %\VignetteEngine{knitr::rmarkdown}
+ %\VignetteEncoding{UTF-8}
+ %\VignettePackage{AnnotationFilter}
+ %\VignetteDepends{org.Hs.eg.db,BiocStyle,RSQLite}
+---
+
+```{r style, echo = FALSE, results = 'asis', message=FALSE}
+BiocStyle::markdown()
+```
+
+**Package**: `r Biocpkg("AnnotationFilter")`<br />
+**Authors**: `r packageDescription("AnnotationFilter")[["Author"]] `<br />
+**Last modified:** `r file.info("AnnotationFilter.Rmd")$mtime`<br />
+**Compiled**: `r date()`
+
+
+# Introduction
+
+A large variety of annotation resources are available in Bioconductor. Accessing
+the full content of these databases or even of single tables is computationally
+expensive and in many instances not required, as users may want to extract only
+sub-sets of the data e.g. genomic coordinates of a single gene. In that respect,
+filtering annotation resources before data extraction has a major impact on
+performance and increases the usability of such genome-scale databases.
+
+The `r Biocpkg("AnnotationFilter")` package was thus developed to provide basic
+filter classes to enable a common filtering framework for Bioconductor
+annotation resources. `r Biocpkg("AnnotationFilter")` defines filter classes for
+some of the most commonly used features in annotation databases, such as
+*symbol* or *genename*. Each filter class is supposed to work on a single
+database table column and to facilitate filtering on the provided values. Such
+filter classes enable the user to build complex queries to retrieve specific
+annotations without needing to know column or table names or the layout of the
+underlying databases. While initially being developed to be used in the
+`r Biocpkg("Organism.dplyr")` and `r Biocpkg("ensembldb")` packages, the filter
+classes and the related filtering concept can be easily added to other
+annotation packages too.
+
+
+# Filter classes
+
+All filter classes extend the basic `AnnotationFilter` class and take one or
+more *values* and a *condition* to allow filtering on a single database table
+column. Based on the type of the input value, filter classes are divided into:
+
+- `CharacterFilter`: takes a `character` value of length >= 1 and supports
+ conditions `==`, `!=`, `startsWith` and `endsWith`. An example would be a
+ `GeneIdFilter` that allows to filter on gene IDs.
+
+- `IntegerFilter`: takes a single `integer` as input and supports the conditions
+ `==`, `!=`, `>`, `<`, `>=` and `<=`. An example would be a `GeneStartFilter`
+ that filters results on the (chromosomal) start coordinates of genes.
+
+- `GRangesFilter`: is a special filter, as it takes a `GRanges` as `value` and
+ performs the filtering on a combination of columns (i.e. start and end
+ coordinate as well as sequence name and strand). To be consistent with the
+ `findOverlaps` method from the `r Biocpkg("IRanges")` package, the constructor
+ of the `GRangesFilter` filter takes a `type` argument to define its
+ condition. Supported values are `"any"` (the default) that retrieves all
+ entries overlapping the `GRanges`, `"start"` and `"end"` matching all features
+ with the same start and end coordinate respectively, `"within"` that matches
+ all features that are *within* the range defined by the `GRanges` and
+ `"equal"` that returns features that are equal to the `GRanges`.
+
+The names of the filter classes are intuitive, the first part corresponding to
+the database column name with each character following a `_` being capitalized,
+followed by the key word `Filter`. The name of a filter for a database table
+column `gene_id` is thus called `GeneIdFilter`. The default database column for
+a filter is stored in its `field` slot (accessible *via* the `field` method).
+
+The `supportedFilters` method can be used to get an overview of all available
+filter objects defined in `AnnotationFilter`.
+
+```{r supportedFilters}
+library(AnnotationFilter)
+supportedFilters()
+```
+
+Note that the `AnnotationFilter` package provides only the filter classes
+but not the functionality to apply the filtering. Such functionality is
+annotation resource and database layout dependent and needs thus to be
+implemented in the packages providing access to annotation resources.
+
+
+# Usage
+
+Filters are created *via* their dedicated constructor functions, such as the
+`GeneIdFilter` function for the `GeneIdFilter` class. Because of this simple and
+cheap creation, filter classes are thought to be *read-only* and thus don't
+provide *setter* methods to change their slot values. In addition to the
+constructor functions, `AnnotationFilter` provides the functionality to
+*translate* query expressions into filter classes (see further below for an
+example).
+
+Below we create a `SymbolFilter` that could be used to filter an annotation
+resource to retrieve all entries associated with the specified symbol value(s).
+
+```{r symbol-filter}
+library(AnnotationFilter)
+
+smbl <- SymbolFilter("BCL2")
+smbl
+```
+
+Such a filter is supposed to be used to retrieve all entries associated to
+features with a value in a database table column called *symbol* matching the
+filter's value `"BCL2"`.
+
+Using the `"startsWith"` condition we could define a filter to retrieve all
+entries for genes with a gene name/symbol starting with the specified value
+(e.g. `"BCL2"` and `"BCL2L11"` for the example below).
+
+```{r symbol-startsWith}
+smbl <- SymbolFilter("BCL2", condition = "startsWith")
+smbl
+```
+
+In addition to the constructor functions, `AnnotationFilter` provides a
+functionality to create filter instances in a more natural and intuitive way by
+*translating* filter expressions (written as a *formula*, i.e. starting with a
+`~`).
+
+```{r convert-expression}
+smbl <- AnnotationFilter(~ symbol == "BCL2")
+smbl
+```
+
+Individual `AnnotationFilter` objects can be combined in an
+`AnnotationFilterList`. This class extends `list` and provides an additional
+`logOp` slot that defines how its individual filters are supposed to be
+combined. The length of `logOp` has to be 1 less than the number of filter
+objects. Each element in `logOp` defines how two consecutive filters should
+be combined. Below we create an `AnnotationFilterList` containing two filter
+objects to be combined with a logical *AND*.
+
+```{r convert-multi-expression}
+flt <- AnnotationFilter(~ symbol == "BCL2" &
+ tx_biotype == "protein_coding")
+flt
+```
+
+Note that the `AnnotationFilter` function does not (yet) support translation of
+nested expressions, such as `(symbol == "BCL2L11" & tx_biotype ==
+"nonsense_mediated_decay") | (symbol == "BCL2" & tx_biotype ==
+"protein_coding")`. Such queries can however be built by nesting
+`AnnotationFilterList` classes.
+
+```{r nested-query}
+## Define the filter query for the first pair of filters.
+afl1 <- AnnotationFilterList(SymbolFilter("BCL2L11"),
+ TxBiotypeFilter("nonsense_mediated_decay"))
+## Define the second filter pair (the filters within each pair of brackets should be combined).
+afl2 <- AnnotationFilterList(SymbolFilter("BCL2"),
+ TxBiotypeFilter("protein_coding"))
+## Now combine both with a logical OR
+afl <- AnnotationFilterList(afl1, afl2, logOp = "|")
+
+afl
+```
+
+This `AnnotationFilterList` would now select all entries for all transcripts of
+the gene *BCL2L11* with the biotype *nonsense_mediated_decay* or for all protein
+coding transcripts of the gene *BCL2*.
+
+
+# Using `AnnotationFilter` in other packages
+
+The `AnnotationFilter` package does only provide filter classes, but no
+filtering functionality. This has to be implemented in the package using the
+filters. In this section we first show in a very simple example how
+`AnnotationFilter` classes could be used to filter a `data.frame` and
+subsequently explore how a simple filter framework could be implemented for
+SQL-based annotation resources.
+
+Let's first define a simple `data.frame` containing the data we want to
+filter. Note that subsetting this `data.frame` using `AnnotationFilter` is
+obviously not the best solution, but it should help to understand the basic
+concept.
+
+```{r define-data.frame}
+## Define a simple gene table
+gene <- data.frame(gene_id = 1:10,
+ symbol = c(letters[1:9], "b"),
+ seq_name = paste0("chr", c(1, 4, 4, 8, 1, 2, 5, 3, "X", 4)),
+ stringsAsFactors = FALSE)
+gene
+```
+
+Next we generate a `SymbolFilter` and inspect what information we can extract
+from it.
+
+```{r simple-symbol}
+smbl <- SymbolFilter("b")
+```
+
+We can access the filter *condition* using the `condition` method
+
+```{r simple-symbol-condition}
+condition(smbl)
+```
+
+The value of the filter using the `value` method
+
+```{r simple-symbol-value}
+value(smbl)
+```
+
+And finally the *field* (i.e. column in the data table) using the `field`
+method.
+
+```{r simple-symbol-field}
+field(smbl)
+```
+
+With this information we can define a simple function that takes the data table
+and the filter as input and returns a `logical` with length equal to the number
+of rows of the table, `TRUE` for rows matching the filter.
+
+```{r doMatch}
+
+doMatch <- function(x, filter) { ## `x`: data table, `filter`: filter object
+ do.call(condition(filter), list(x[, field(filter)], value(filter))) ## calls <condition>(<filter's column of x>, <filter value>)
+}
+
+## Apply this function
+doMatch(gene, smbl)
+
+```
+
+Note that this simple function does not support multiple filters and also not
+conditions `"startsWith"` or `"endsWith"`. Next we define a second function that
+extracts the relevant data from the data resource.
+
+```{r doExtract}
+
+doExtract <- function(x, filter) { ## `x`: data table, `filter`: passed on to doMatch()
+ x[doMatch(x, filter), ] ## keep the rows selected by the result of doMatch()
+}
+
+## Apply it on the data
+doExtract(gene, smbl)
+```
+
+We could even modify the `doMatch` function to enable filter expressions.
+
+```{r doMatch-formula}
+
+doMatch <- function(x, filter) { ## same call as the earlier doMatch(), but `filter` may also be a formula
+ if (is(filter, "formula"))
+ filter <- AnnotationFilter(filter) ## translate the filter expression with AnnotationFilter()
+ do.call(condition(filter), list(x[, field(filter)], value(filter))) ## calls <condition>(<filter's column of x>, <filter value>)
+}
+
+doExtract(gene, ~ gene_id == '2')
+
+```
+
+For such simple examples `AnnotationFilter` might be an overkill as the same
+could be achieved (much simpler) using standard R operations. A real case
+scenario in which `AnnotationFilter` becomes useful is SQL-based annotation
+resources. We will thus explore next how SQL resources could be filtered using
+`AnnotationFilter`.
+
+We use the SQLite database from the `r Biocpkg("org.Hs.eg.db")` package that
+provides a variety of annotations for all human genes. Using the package's
+connection to the database we inspect first what database tables are available
+and then select one for our simple filtering example.
+
+We use the SQLite database provided by the `r Biocpkg("org.Hs.eg.db")` package
+and implement simple filter functions to extract specific data from one of its
+database tables. We thus load below the `org.Hs.eg.db` package that provides
+access to annotations for human genes. Using its connection to the database we
+inspect first what database tables are available and then what *fields*
+(i.e. columns) the *gene_info* table has.
+
+```{r orgDb, message = FALSE}
+## Load the required packages
+library(org.Hs.eg.db)
+library(RSQLite)
+## Get the database connection
+dbcon <- org.Hs.eg_dbconn()
+
+## What tables do we have?
+dbListTables(dbcon)
+```
+
+`org.Hs.eg.db` provides many different tables, one for each identifier or
+annotation resource. We will use the *gene_info* table and determine which
+*fields* (i.e. columns) the table provides.
+
+```{r gene_info}
+## What fields are there in the gene_info table?
+dbListFields(dbcon, "gene_info")
+```
+
+The *gene_info* table provides the official gene symbol and the gene name. The
+column *symbol* matches the default `field` value of the `SymbolFilter`. For the
+`GenenameFilter` we would have to re-map its default field `"genename"` to the
+database column *gene_name*. There are many possibilities to do this, one would
+be to implement an own function to extract the field from the `AnnotationFilter`
+classes specific to the database. This function eventually renames the extracted
+field value to match the corresponding name of the database column name.
+
+We next implement a simple `doExtractGene` function that retrieves data from the
+*gene_info* table and re-uses the `doExtract` function to extract specific
+data. The parameter `x` is now the database connection object.
+
+```{r doExtractSQL}
+
+doExtractGene <- function(x, filter) { ## `x`: database connection
+ gene <- dbGetQuery(x, "select * from gene_info") ## fetch the complete gene_info table first ...
+ doExtract(gene, filter) ## ... and subset it in R afterwards
+}
+
+## Extract all entries for BCL2
+bcl2 <- doExtractGene(dbcon, SymbolFilter("BCL2"))
+
+bcl2
+```
+
+This works, but is not really efficient, since the function first fetches the
+full database table and subsets it only afterwards. A much more efficient
+solution is to *translate* the `AnnotationFilter` class(es) to an SQL *where*
+condition and hence perform the filtering on the database level. Here we have to
+do some small modifications, since not all condition values can be used 1:1 in
+SQL calls. The condition `"=="` has for example to be converted into `"="` and
+the `"startsWith"` into a SQL `"like"` by adding also a `"%"` wildcard to the
+value of the filter. We would also have to deal with filters that have a `value`
+of length > 1. A `SymbolFilter` with a `value` being `c("BCL2", "BCL2L11")`
+would for example have to be converted to a SQL call `"symbol in
+('BCL2','BCL2L11')"`. Here we skip these special cases and define a simple
+function that translates an `AnnotationFilter` to a *where* condition to be
+included into the SQL call. Depending on whether the filter extends
+`CharacterFilter` or `IntegerFilter` the value has also to be quoted.
+
+```{r simpleSQL}
+
+## Define a simple function that covers some condition conversion
+conditionForSQL <- function(x) { ## `x`: the filter condition as a single character string
+ switch(x,
+ "==" = "=", ## R's "==" is written "=" in SQL
+ x) ## every other condition is returned unchanged
+}
+
+## Define a function to translate a filter into an SQL where condition.
+## Character values have to be quoted.
+where <- function(x) { ## `x`: filter object; returns <field><condition><value> pasted into one string
+ if (is(x, "CharacterFilter"))
+ value <- paste0("'", value(x), "'") ## wrap character values in single quotes
+ else value <- value(x)
+ paste0(field(x), conditionForSQL(condition(x)), value) ## NOTE(review): the value is pasted in unescaped; acceptable for illustration only
+}
+
+## Now "translate" a filter using this function
+where(SeqNameFilter("Y"))
+
+```
+
+Next we implement a new function which integrates the filter into the SQL call
+to let the database server take care of the filtering.
+
+```{r doExtractGene2}
+
+## Define a function that performs the filtering within the SQL query
+doExtractGene2 <- function(x, filter) {
+ if (is(filter, "formula"))
+ filter <- AnnotationFilter(filter) ## translate the filter expression with AnnotationFilter()
+ query <- paste0("select * from gene_info where ", where(filter)) ## append the translated filter as where condition
+ dbGetQuery(x, query)
+}
+
+bcl2 <- doExtractGene2(dbcon, ~ symbol == "BCL2")
+bcl2
+
+```
+
+Below we compare the performance of both approaches.
+
+```{r performance}
+system.time(doExtractGene(dbcon, ~ symbol == "BCL2"))
+
+system.time(doExtractGene2(dbcon, ~ symbol == "BCL2"))
+
+```
+
+Not surprisingly, the second approach is much faster.
+
+Be aware that the examples shown here are only for illustration purposes. In a
+real world situation additional factors, like combinations of filters, which
+database tables to join, which columns to be returned etc. would have to be
+considered too.
+
+# Session information
+
+```{r si}
+sessionInfo()
+```
diff --git a/inst/doc/AnnotationFilter.html b/inst/doc/AnnotationFilter.html
new file mode 100644
index 0000000..790b545
--- /dev/null
+++ b/inst/doc/AnnotationFilter.html
@@ -0,0 +1,571 @@
+<!DOCTYPE html>
+
+<html xmlns="http://www.w3.org/1999/xhtml">
+
+<head>
+
+<meta charset="utf-8" />
+<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
+<meta name="generator" content="pandoc" />
+
+
+
+<meta name="date" content="2017-04-24" />
+
+<title>Facilities for Filtering Bioconductor Annotation Resources</title>
+
+<script src="data:application/x-javascript;base64,LyohIGpRdWVyeSB2MS4xMS4zIHwgKGMpIDIwMDUsIDIwMTUgalF1ZXJ5IEZvdW5kYXRpb24sIEluYy4gfCBqcXVlcnkub3JnL2xpY2Vuc2UgKi8KIWZ1bmN0aW9uKGEsYil7Im9iamVjdCI9PXR5cGVvZiBtb2R1bGUmJiJvYmplY3QiPT10eXBlb2YgbW9kdWxlLmV4cG9ydHM/bW9kdWxlLmV4cG9ydHM9YS5kb2N1bWVudD9iKGEsITApOmZ1bmN0aW9uKGEpe2lmKCFhLmRvY3VtZW50KXRocm93IG5ldyBFcnJvcigialF1ZXJ5IHJlcXVpcmVzIGEgd2luZG93IHdpdGggYSBkb2N1bWVudCIpO3JldHVybiBiKGEpfTpiKGEpfSgidW5kZWZpbmVkIiE9dHlwZW9mIHdpbmRvdz93aW5kb3c6dG [...]
+<meta name="viewport" content="width=device-width, initial-scale=1" />
+<link href="data:text/css;charset=utf-8,html%7Bfont%2Dfamily%3Asans%2Dserif%3B%2Dwebkit%2Dtext%2Dsize%2Dadjust%3A100%25%3B%2Dms%2Dtext%2Dsize%2Dadjust%3A100%25%7Dbody%7Bmargin%3A0%7Darticle%2Caside%2Cdetails%2Cfigcaption%2Cfigure%2Cfooter%2Cheader%2Chgroup%2Cmain%2Cmenu%2Cnav%2Csection%2Csummary%7Bdisplay%3Ablock%7Daudio%2Ccanvas%2Cprogress%2Cvideo%7Bdisplay%3Ainline%2Dblock%3Bvertical%2Dalign%3Abaseline%7Daudio%3Anot%28%5Bcontrols%5D%29%7Bdisplay%3Anone%3Bheight%3A0%7D%5Bhidden%5D%2Ctem [...]
+<script src="data:application/x-javascript;base64,LyohCiAqIEJvb3RzdHJhcCB2My4zLjUgKGh0dHA6Ly9nZXRib290c3RyYXAuY29tKQogKiBDb3B5cmlnaHQgMjAxMS0yMDE1IFR3aXR0ZXIsIEluYy4KICogTGljZW5zZWQgdW5kZXIgdGhlIE1JVCBsaWNlbnNlCiAqLwppZigidW5kZWZpbmVkIj09dHlwZW9mIGpRdWVyeSl0aHJvdyBuZXcgRXJyb3IoIkJvb3RzdHJhcCdzIEphdmFTY3JpcHQgcmVxdWlyZXMgalF1ZXJ5Iik7K2Z1bmN0aW9uKGEpeyJ1c2Ugc3RyaWN0Ijt2YXIgYj1hLmZuLmpxdWVyeS5zcGxpdCgiICIpWzBdLnNwbGl0KCIuIik7aWYoYlswXTwyJiZiWzFdPDl8fDE9PWJbMF0mJjk9PWJbMV0mJmJbMl08MSl0aHJvdy [...]
+<script src="data:application/x-javascript;base64,LyoqCiogQHByZXNlcnZlIEhUTUw1IFNoaXYgMy43LjIgfCBAYWZhcmthcyBAamRhbHRvbiBAam9uX25lYWwgQHJlbSB8IE1JVC9HUEwyIExpY2Vuc2VkCiovCi8vIE9ubHkgcnVuIHRoaXMgY29kZSBpbiBJRSA4CmlmICghIXdpbmRvdy5uYXZpZ2F0b3IudXNlckFnZW50Lm1hdGNoKCJNU0lFIDgiKSkgewohZnVuY3Rpb24oYSxiKXtmdW5jdGlvbiBjKGEsYil7dmFyIGM9YS5jcmVhdGVFbGVtZW50KCJwIiksZD1hLmdldEVsZW1lbnRzQnlUYWdOYW1lKCJoZWFkIilbMF18fGEuZG9jdW1lbnRFbGVtZW50O3JldHVybiBjLmlubmVySFRNTD0ieDxzdHlsZT4iK2IrIjwvc3R5bGU+IixkLm [...]
+<script src="data:application/x-javascript;base64,LyohIFJlc3BvbmQuanMgdjEuNC4yOiBtaW4vbWF4LXdpZHRoIG1lZGlhIHF1ZXJ5IHBvbHlmaWxsICogQ29weXJpZ2h0IDIwMTMgU2NvdHQgSmVobAogKiBMaWNlbnNlZCB1bmRlciBodHRwczovL2dpdGh1Yi5jb20vc2NvdHRqZWhsL1Jlc3BvbmQvYmxvYi9tYXN0ZXIvTElDRU5TRS1NSVQKICogICovCgovLyBPbmx5IHJ1biB0aGlzIGNvZGUgaW4gSUUgOAppZiAoISF3aW5kb3cubmF2aWdhdG9yLnVzZXJBZ2VudC5tYXRjaCgiTVNJRSA4IikpIHsKIWZ1bmN0aW9uKGEpeyJ1c2Ugc3RyaWN0IjthLm1hdGNoTWVkaWE9YS5tYXRjaE1lZGlhfHxmdW5jdGlvbihhKXt2YXIgYixjPWEuZG [...]
+<script src="data:application/x-javascript;base64,LyohIGpRdWVyeSBVSSAtIHYxLjExLjQgLSAyMDE2LTAxLTA1CiogaHR0cDovL2pxdWVyeXVpLmNvbQoqIEluY2x1ZGVzOiBjb3JlLmpzLCB3aWRnZXQuanMsIG1vdXNlLmpzLCBwb3NpdGlvbi5qcywgZHJhZ2dhYmxlLmpzLCBkcm9wcGFibGUuanMsIHJlc2l6YWJsZS5qcywgc2VsZWN0YWJsZS5qcywgc29ydGFibGUuanMsIGFjY29yZGlvbi5qcywgYXV0b2NvbXBsZXRlLmpzLCBidXR0b24uanMsIGRpYWxvZy5qcywgbWVudS5qcywgcHJvZ3Jlc3NiYXIuanMsIHNlbGVjdG1lbnUuanMsIHNsaWRlci5qcywgc3Bpbm5lci5qcywgdGFicy5qcywgdG9vbHRpcC5qcywgZWZmZWN0LmpzLC [...]
+<link href="data:text/css;charset=utf-8,%0A%0A%2Etocify%20%7B%0Awidth%3A%2020%25%3B%0Amax%2Dheight%3A%2090%25%3B%0Aoverflow%3A%20auto%3B%0Amargin%2Dleft%3A%202%25%3B%0Aposition%3A%20fixed%3B%0Aborder%3A%201px%20solid%20%23ccc%3B%0Awebkit%2Dborder%2Dradius%3A%206px%3B%0Amoz%2Dborder%2Dradius%3A%206px%3B%0Aborder%2Dradius%3A%206px%3B%0A%7D%0A%0A%2Etocify%20ul%2C%20%2Etocify%20li%20%7B%0Alist%2Dstyle%3A%20none%3B%0Amargin%3A%200%3B%0Apadding%3A%200%3B%0Aborder%3A%20none%3B%0Aline%2Dheight%3 [...]
+<script src="data:application/x-javascript;base64,LyoganF1ZXJ5IFRvY2lmeSAtIHYxLjkuMSAtIDIwMTMtMTAtMjIKICogaHR0cDovL3d3dy5ncmVnZnJhbmtvLmNvbS9qcXVlcnkudG9jaWZ5LmpzLwogKiBDb3B5cmlnaHQgKGMpIDIwMTMgR3JlZyBGcmFua287IExpY2Vuc2VkIE1JVCAqLwoKLy8gSW1tZWRpYXRlbHktSW52b2tlZCBGdW5jdGlvbiBFeHByZXNzaW9uIChJSUZFKSBbQmVuIEFsbWFuIEJsb2cgUG9zdF0oaHR0cDovL2JlbmFsbWFuLmNvbS9uZXdzLzIwMTAvMTEvaW1tZWRpYXRlbHktaW52b2tlZC1mdW5jdGlvbi1leHByZXNzaW9uLykgdGhhdCBjYWxscyBhbm90aGVyIElJRkUgdGhhdCBjb250YWlucyBhbGwgb2YgdG [...]
+<script src="data:application/x-javascript;base64,CgovKioKICogalF1ZXJ5IFBsdWdpbjogU3RpY2t5IFRhYnMKICoKICogQGF1dGhvciBBaWRhbiBMaXN0ZXIgPGFpZGFuQHBocC5uZXQ+CiAqIGFkYXB0ZWQgYnkgUnViZW4gQXJzbGFuIHRvIGFjdGl2YXRlIHBhcmVudCB0YWJzIHRvbwogKiBodHRwOi8vd3d3LmFpZGFubGlzdGVyLmNvbS8yMDE0LzAzL3BlcnNpc3RpbmctdGhlLXRhYi1zdGF0ZS1pbi1ib290c3RyYXAvCiAqLwooZnVuY3Rpb24oJCkgewogICJ1c2Ugc3RyaWN0IjsKICAkLmZuLnJtYXJrZG93blN0aWNreVRhYnMgPSBmdW5jdGlvbigpIHsKICAgIHZhciBjb250ZXh0ID0gdGhpczsKICAgIC8vIFNob3cgdGhlIHRhYi [...]
+<link href="data:text/css;charset=utf-8,pre%20%2Eoperator%2C%0Apre%20%2Eparen%20%7B%0Acolor%3A%20rgb%28104%2C%20118%2C%20135%29%0A%7D%0Apre%20%2Eliteral%20%7B%0Acolor%3A%20%23990073%0A%7D%0Apre%20%2Enumber%20%7B%0Acolor%3A%20%23099%3B%0A%7D%0Apre%20%2Ecomment%20%7B%0Acolor%3A%20%23998%3B%0Afont%2Dstyle%3A%20italic%0A%7D%0Apre%20%2Ekeyword%20%7B%0Acolor%3A%20%23900%3B%0Afont%2Dweight%3A%20bold%0A%7D%0Apre%20%2Eidentifier%20%7B%0Acolor%3A%20rgb%280%2C%200%2C%200%29%3B%0A%7D%0Apre%20%2Estri [...]
+<script src="data:application/x-javascript;base64,dmFyIGhsanM9bmV3IGZ1bmN0aW9uKCl7ZnVuY3Rpb24gbShwKXtyZXR1cm4gcC5yZXBsYWNlKC8mL2dtLCImYW1wOyIpLnJlcGxhY2UoLzwvZ20sIiZsdDsiKX1mdW5jdGlvbiBmKHIscSxwKXtyZXR1cm4gUmVnRXhwKHEsIm0iKyhyLmNJPyJpIjoiIikrKHA/ImciOiIiKSl9ZnVuY3Rpb24gYihyKXtmb3IodmFyIHA9MDtwPHIuY2hpbGROb2Rlcy5sZW5ndGg7cCsrKXt2YXIgcT1yLmNoaWxkTm9kZXNbcF07aWYocS5ub2RlTmFtZT09IkNPREUiKXtyZXR1cm4gcX1pZighKHEubm9kZVR5cGU9PTMmJnEubm9kZVZhbHVlLm1hdGNoKC9ccysvKSkpe2JyZWFrfX19ZnVuY3Rpb24gaCh0LH [...]
+
+<style type="text/css">code{white-space: pre;}</style>
+<style type="text/css">
+
+</style>
+<script type="text/javascript">
+if (window.hljs && document.readyState && document.readyState === "complete") {
+ window.setTimeout(function() {
+ hljs.initHighlighting();
+ }, 0);
+}
+</script>
+
+
+
+<style type="text/css">
+h1 {
+ font-size: 34px;
+}
+h1.title {
+ font-size: 38px;
+}
+h2 {
+ font-size: 30px;
+}
+h3 {
+ font-size: 24px;
+}
+h4 {
+ font-size: 18px;
+}
+h5 {
+ font-size: 16px;
+}
+h6 {
+ font-size: 12px;
+}
+.table th:not([align]) {
+ text-align: left;
+}
+</style>
+
+<link href="data:text/css;charset=utf-8,body%20%7B%0Amargin%3A%200px%20auto%3B%0Amax%2Dwidth%3A%201134px%3B%0A%7D%0Abody%2C%20td%20%7B%0Afont%2Dfamily%3A%20sans%2Dserif%3B%0Afont%2Dsize%3A%2010pt%3B%0A%7D%0A%0Adiv%23TOC%20ul%20%7B%0Apadding%3A%200px%200px%200px%2045px%3B%0Alist%2Dstyle%3A%20none%3B%0Abackground%2Dimage%3A%20none%3B%0Abackground%2Drepeat%3A%20none%3B%0Abackground%2Dposition%3A%200%3B%0Afont%2Dsize%3A%2010pt%3B%0Afont%2Dfamily%3A%20Helvetica%2C%20Arial%2C%20sans%2Dserif%3B [...]
+
+</head>
+
+<body>
+
+<style type="text/css">
+.main-container {
+ max-width: 828px;
+ margin-left: auto;
+ margin-right: auto;
+}
+
+img {
+ max-width:100%;
+ height: auto;
+}
+.tabbed-pane {
+ padding-top: 12px;
+}
+button.code-folding-btn:focus {
+ outline: none;
+}
+</style>
+
+
+
+<div class="container-fluid main-container">
+
+<!-- tabsets -->
+<script>
+$(document).ready(function () {
+ window.buildTabsets("TOC");
+});
+</script>
+
+<!-- code folding -->
+
+
+
+
+<script>
+$(document).ready(function () {
+
+ // move toc-ignore selectors from section div to header
+ $('div.section.toc-ignore')
+ .removeClass('toc-ignore')
+ .children('h1,h2,h3,h4,h5').addClass('toc-ignore');
+
+ // establish options
+ var options = {
+ selectors: "h1,h2,h3",
+ theme: "bootstrap3",
+ context: '.toc-content',
+ hashGenerator: function (text) {
+ return text.replace(/[.\\/?&!#<>]/g, '').replace(/\s/g, '_').toLowerCase();
+ },
+ ignoreSelector: ".toc-ignore",
+ scrollTo: 0
+ };
+ options.showAndHide = true;
+ options.smoothScroll = true;
+
+ // tocify
+ var toc = $("#TOC").tocify(options).data("toc-tocify");
+});
+</script>
+
+<style type="text/css">
+
+#TOC {
+ margin: 25px 0px 20px 0px;
+}
+@media (max-width: 768px) {
+#TOC {
+ position: relative;
+ width: 100%;
+}
+}
+
+
+
+
+div.main-container {
+ max-width: 1200px;
+}
+
+div.tocify {
+ width: 20%;
+ max-width: 246px;
+ max-height: 85%;
+}
+
+@media (min-width: 768px) and (max-width: 991px) {
+ div.tocify {
+ width: 25%;
+ }
+}
+
+@media (max-width: 767px) {
+ div.tocify {
+ width: 100%;
+ max-width: none;
+ }
+}
+
+.tocify ul, .tocify li {
+ line-height: 20px;
+}
+
+.tocify-subheader .tocify-item {
+ font-size: 0.90em;
+ padding-left: 25px;
+ text-indent: 0;
+}
+
+.tocify .list-group-item {
+ border-radius: 0px;
+}
+
+
+</style>
+
+<!-- setup 3col/9col grid for toc_float and main content -->
+<div class="row-fluid">
+<div class="col-xs-12 col-sm-4 col-md-3">
+<div id="TOC" class="tocify">
+</div>
+</div>
+
+<div class="toc-content col-xs-12 col-sm-8 col-md-9">
+
+
+
+
+<div class="fluid-row" id="header">
+
+
+
+<h1 class="title toc-ignore">Facilities for Filtering Bioconductor Annotation Resources</h1>
+<h4 class="date"><em>24 April 2017</em></h4>
+
+</div>
+
+
+<script type="text/javascript">
+document.addEventListener("DOMContentLoaded", function() {
+ document.querySelector("h1").className = "title";
+});
+</script>
+<script type="text/javascript">
+document.addEventListener("DOMContentLoaded", function() {
+ var links = document.links;
+ for (var i = 0, linksLength = links.length; i < linksLength; i++)
+ if (links[i].hostname != window.location.hostname)
+ links[i].target = '_blank';
+});
+</script>
+<p><strong>Package</strong>: <em><a href="http://bioconductor.org/packages/AnnotationFilter">AnnotationFilter</a></em><br /> <strong>Authors</strong>: Martin Morgan [aut], Johannes Rainer [aut], Bioconductor Maintainer [cre]<br /> <strong>Last modified:</strong> 2017-04-24 16:35:20<br /> <strong>Compiled</strong>: Mon Apr 24 21:05:12 2017</p>
+<div id="introduction" class="section level1">
+<h1><span class="header-section-number">1</span> Introduction</h1>
+<p>A large variety of annotation resources are available in Bioconductor. Accessing the full content of these databases or even of single tables is computationally expensive and in many instances not required, as users may want to extract only sub-sets of the data e.g. genomic coordinates of a single gene. In that respect, filtering annotation resources before data extraction has a major impact on performance and increases the usability of such genome-scale databases.</p>
+<p>The <em><a href="http://bioconductor.org/packages/AnnotationFilter">AnnotationFilter</a></em> package was thus developed to provide basic filter classes to enable a common filtering framework for Bioconductor annotation resources. <em><a href="http://bioconductor.org/packages/AnnotationFilter">AnnotationFilter</a></em> defines filter classes for some of the most commonly used features in annotation databases, such as <em>symbol</em> or <em>genename</em>. Each filter class is supposed [...]
+</div>
+<div id="filter-classes" class="section level1">
+<h1><span class="header-section-number">2</span> Filter classes</h1>
+<p>All filter classes extend the basic <code>AnnotationFilter</code> class and take one or more <em>values</em> and a <em>condition</em> to allow filtering on a single database table column. Based on the type of the input value, filter classes are divided into:</p>
+<ul>
+<li><p><code>CharacterFilter</code>: takes a <code>character</code> value of length >= 1 and supports conditions <code>==</code>, <code>!=</code>, <code>startsWith</code> and <code>endsWith</code>. An example would be a <code>GeneIdFilter</code> that allows to filter on gene IDs.</p></li>
+<li><p><code>IntegerFilter</code>: takes a single <code>integer</code> as input and supports the conditions <code>==</code>, <code>!=</code>, <code>&gt;</code>, <code>&lt;</code>, <code>&gt;=</code> and <code>&lt;=</code>. An example would be a <code>GeneStartFilter</code> that filters results on the (chromosomal) start coordinates of genes.</p></li>
+<li><p><code>GRangesFilter</code>: is a special filter, as it takes a <code>GRanges</code> as <code>value</code> and performs the filtering on a combination of columns (i.e. start and end coordinate as well as sequence name and strand). To be consistent with the <code>findOverlaps</code> method from the <em><a href="http://bioconductor.org/packages/IRanges">IRanges</a></em> package, the constructor of the <code>GRangesFilter</code> filter takes a <code>type</code> argument to define its [...]
+</ul>
+<p>The names of the filter classes are intuitive, the first part corresponding to the database column name with each character following a <code>_</code> being capitalized, followed by the key word <code>Filter</code>. The name of a filter for a database table column <code>gene_id</code> is thus called <code>GeneIdFilter</code>. The default database column for a filter is stored in its <code>field</code> slot (accessible <em>via</em> the <code>field</code> method).</p>
+<p>The <code>supportedFilters</code> method can be used to get an overview of all available filter objects defined in <code>AnnotationFilter</code>.</p>
+<pre class="r"><code>library(AnnotationFilter)
+supportedFilters()</code></pre>
+<pre><code>## [1] "CdsEndFilter" "CdsStartFilter" "EntrezFilter"
+## [4] "ExonEndFilter" "ExonIdFilter" "ExonNameFilter"
+## [7] "ExonRankFilter" "ExonStartFilter" "GRangesFilter"
+## [10] "GeneBiotypeFilter" "GeneEndFilter" "GeneIdFilter"
+## [13] "GeneStartFilter" "GenenameFilter" "ProteinIdFilter"
+## [16] "SeqNameFilter" "SeqStrandFilter" "SymbolFilter"
+## [19] "TxBiotypeFilter" "TxEndFilter" "TxIdFilter"
+## [22] "TxNameFilter" "TxStartFilter" "UniprotFilter"</code></pre>
+<p>Note that the <code>AnnotationFilter</code> package provides only the filter classes but not the functionality to apply the filtering. Such functionality is annotation resource and database layout dependent and thus needs to be implemented in the packages providing access to annotation resources.</p>
+</div>
+<div id="usage" class="section level1">
+<h1><span class="header-section-number">3</span> Usage</h1>
+<p>Filters are created <em>via</em> their dedicated constructor functions, such as the <code>GeneIdFilter</code> function for the <code>GeneIdFilter</code> class. Because of this simple and cheap creation, filter classes are thought to be <em>read-only</em> and thus don’t provide <em>setter</em> methods to change their slot values. In addition to the constructor functions, <code>AnnotationFilter</code> provides the functionality to <em>translate</em> query expressions into filter classes [...]
+<p>Below we create a <code>SymbolFilter</code> that could be used to filter an annotation resource to retrieve all entries associated with the specified symbol value(s).</p>
+<pre class="r"><code>library(AnnotationFilter)
+
+smbl <- SymbolFilter("BCL2")
+smbl</code></pre>
+<pre><code>## class: SymbolFilter
+## condition: ==
+## value: BCL2</code></pre>
+<p>Such a filter is supposed to be used to retrieve all entries associated to features with a value in a database table column called <em>symbol</em> matching the filter’s value <code>"BCL2"</code>.</p>
+<p>Using the <code>"startsWith"</code> condition we could define a filter to retrieve all entries for genes with a gene name/symbol starting with the specified value (e.g. <code>"BCL2"</code> and <code>"BCL2L11"</code> for the example below).</p>
+<pre class="r"><code>smbl <- SymbolFilter("BCL2", condition = "startsWith")
+smbl</code></pre>
+<pre><code>## class: SymbolFilter
+## condition: startsWith
+## value: BCL2</code></pre>
+<p>In addition to the constructor functions, <code>AnnotationFilter</code> provides a functionality to create filter instances in a more natural and intuitive way by <em>translating</em> filter expressions (written as a <em>formula</em>, i.e. starting with a <code>~</code>).</p>
+<pre class="r"><code>smbl <- AnnotationFilter(~ symbol == "BCL2")
+smbl</code></pre>
+<pre><code>## class: SymbolFilter
+## condition: ==
+## value: BCL2</code></pre>
+<p>Individual <code>AnnotationFilter</code> objects can be combined in an <code>AnnotationFilterList</code>. This class extends <code>list</code> and provides an additional <code>logOp</code> slot that defines how its individual filters are supposed to be combined. The length of <code>logOp</code> has to be 1 less than the number of filter objects. Each element in <code>logOp</code> defines how two consecutive filters should be combined. Below we create a <code>AnnotationFilterList</code [...]
+<pre class="r"><code>flt <- AnnotationFilter(~ symbol == "BCL2" &
+ tx_biotype == "protein_coding")
+flt</code></pre>
+<pre><code>## class: AnnotationFilterList
+## length: 2
+## filters:
+##
+## class: SymbolFilter
+## condition: ==
+## value: BCL2
+##
+## &
+##
+## class: TxBiotypeFilter
+## condition: ==
+## value: protein_coding</code></pre>
+<p>Note that the <code>AnnotationFilter</code> function does not (yet) support translation of nested expressions, such as <code>(symbol == "BCL2L11" & tx_biotype == "nonsense_mediated_decay") | (symbol == "BCL2" & tx_biotype == "protein_coding")</code>. Such queries can however be built by nesting <code>AnnotationFilterList</code> classes.</p>
+<pre class="r"><code>## Define the filter query for the first pair of filters.
+afl1 <- AnnotationFilterList(SymbolFilter("BCL2L11"),
+ TxBiotypeFilter("nonsense_mediated_decay"))
+## Define the filter query for the second pair of filters.
+afl2 <- AnnotationFilterList(SymbolFilter("BCL2"),
+ TxBiotypeFilter("protein_coding"))
+## Now combine both with a logical OR
+afl <- AnnotationFilterList(afl1, afl2, logOp = "|")
+
+afl</code></pre>
+<pre><code>## class: AnnotationFilterList
+## length: 2
+## filters:
+##
+## class: AnnotationFilterList
+## length: 2
+## filters:
+##
+## class: SymbolFilter
+## condition: ==
+## value: BCL2L11
+##
+## &
+##
+## class: TxBiotypeFilter
+## condition: ==
+## value: nonsense_mediated_decay
+##
+## |
+##
+## class: AnnotationFilterList
+## length: 2
+## filters:
+##
+## class: SymbolFilter
+## condition: ==
+## value: BCL2
+##
+## &
+##
+## class: TxBiotypeFilter
+## condition: ==
+## value: protein_coding</code></pre>
+<p>This <code>AnnotationFilterList</code> would now select all entries for all transcripts of the gene <em>BCL2L11</em> with the biotype <em>nonsense_mediated_decay</em> or for all protein coding transcripts of the gene <em>BCL2</em>.</p>
+</div>
+<div id="using-annotationfilter-in-other-packages" class="section level1">
+<h1><span class="header-section-number">4</span> Using <code>AnnotationFilter</code> in other packages</h1>
+<p>The <code>AnnotationFilter</code> package does only provide filter classes, but no filtering functionality. This has to be implemented in the package using the filters. In this section we first show in a very simple example how <code>AnnotationFilter</code> classes could be used to filter a <code>data.frame</code> and subsequently explore how a simple filter framework could be implemented for SQL-based annotation resources.</p>
+<p>Let’s first define a simple <code>data.frame</code> containing the data we want to filter. Note that subsetting this <code>data.frame</code> using <code>AnnotationFilter</code> is obviously not the best solution, but it should help to understand the basic concept.</p>
+<pre class="r"><code>## Define a simple gene table
+gene <- data.frame(gene_id = 1:10,
+ symbol = c(letters[1:9], "b"),
+ seq_name = paste0("chr", c(1, 4, 4, 8, 1, 2, 5, 3, "X", 4)),
+ stringsAsFactors = FALSE)
+gene</code></pre>
+<pre><code>## gene_id symbol seq_name
+## 1 1 a chr1
+## 2 2 b chr4
+## 3 3 c chr4
+## 4 4 d chr8
+## 5 5 e chr1
+## 6 6 f chr2
+## 7 7 g chr5
+## 8 8 h chr3
+## 9 9 i chrX
+## 10 10 b chr4</code></pre>
+<p>Next we generate a <code>SymbolFilter</code> and inspect what information we can extract from it.</p>
+<pre class="r"><code>smbl <- SymbolFilter("b")</code></pre>
+<p>We can access the filter <em>condition</em> using the <code>condition</code> method</p>
+<pre class="r"><code>condition(smbl)</code></pre>
+<pre><code>## [1] "=="</code></pre>
+<p>The value of the filter using the <code>value</code> method</p>
+<pre class="r"><code>value(smbl)</code></pre>
+<pre><code>## [1] "b"</code></pre>
+<p>And finally the <em>field</em> (i.e. column in the data table) using the <code>field</code> method.</p>
+<pre class="r"><code>field(smbl)</code></pre>
+<pre><code>## [1] "symbol"</code></pre>
+<p>With this information we can define a simple function that takes the data table and the filter as input and returns a <code>logical</code> with length equal to the number of rows of the table, <code>TRUE</code> for rows matching the filter.</p>
+<pre class="r"><code>doMatch <- function(x, filter) {
+ do.call(condition(filter), list(x[, field(filter)], value(filter)))
+}
+
+## Apply this function
+doMatch(gene, smbl)</code></pre>
+<pre><code>## [1] FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE</code></pre>
+<p>Note that this simple function does not support multiple filters and also not conditions <code>"startsWith"</code> or <code>"endsWith"</code>. Next we define a second function that extracts the relevant data from the data resource.</p>
+<pre class="r"><code>doExtract <- function(x, filter) {
+ x[doMatch(x, filter), ]
+}
+
+## Apply it on the data
+doExtract(gene, smbl)</code></pre>
+<pre><code>## gene_id symbol seq_name
+## 2 2 b chr4
+## 10 10 b chr4</code></pre>
+<p>We could even modify the <code>doMatch</code> function to enable filter expressions.</p>
+<pre class="r"><code>doMatch <- function(x, filter) {
+ if (is(filter, "formula"))
+ filter <- AnnotationFilter(filter)
+ do.call(condition(filter), list(x[, field(filter)], value(filter)))
+}
+
+doExtract(gene, ~ gene_id == '2')</code></pre>
+<pre><code>## gene_id symbol seq_name
+## 2 2 b chr4</code></pre>
+<p>For such simple examples <code>AnnotationFilter</code> might be an overkill as the same could be achieved (much simpler) using standard R operations. A real case scenario in which <code>AnnotationFilter</code> becomes useful are SQL-based annotation resources. We will thus explore next how SQL resources could be filtered using <code>AnnotationFilter</code>.</p>
+<p>We use the SQLite database from the <em><a href="http://bioconductor.org/packages/org.Hs.eg.db">org.Hs.eg.db</a></em> package that provides a variety of annotations for all human genes. Using the packages’ connection to the database we inspect first what database tables are available and then select one for our simple filtering example.</p>
+<p>We use an <code>EnsDb</code> SQLite database used by the <em><a href="http://bioconductor.org/packages/ensembldb">ensembldb</a></em> package and implement simple filter functions to extract specific data from one of its database tables. We thus load below the <code>EnsDb.Hsapiens.v75</code> package that provides access to human gene, transcript, exon and protein annotations. Using its connection to the database we inspect first what database tables are available and then what <em>fiel [...]
+<pre class="r"><code>## Load the required packages
+library(org.Hs.eg.db)
+library(RSQLite)
+## Get the database connection
+dbcon <- org.Hs.eg_dbconn()
+
+## What tables do we have?
+dbListTables(dbcon)</code></pre>
+<pre><code>## [1] "accessions" "alias" "chrlengths"
+## [4] "chromosome_locations" "chromosomes" "cytogenetic_locations"
+## [7] "ec" "ensembl" "ensembl2ncbi"
+## [10] "ensembl_prot" "ensembl_trans" "gene_info"
+## [13] "genes" "go" "go_all"
+## [16] "go_bp" "go_bp_all" "go_cc"
+## [19] "go_cc_all" "go_mf" "go_mf_all"
+## [22] "kegg" "map_counts" "map_metadata"
+## [25] "metadata" "ncbi2ensembl" "omim"
+## [28] "pfam" "prosite" "pubmed"
+## [31] "refseq" "sqlite_stat1" "ucsc"
+## [34] "unigene" "uniprot"</code></pre>
+<p><code>org.Hs.eg.db</code> provides many different tables, one for each identifier or annotation resource. We will use the <em>gene_info</em> table and determine which <em>fields</em> (i.e. columns) the table provides.</p>
+<pre class="r"><code>## What fields are there in the gene_info table?
+dbListFields(dbcon, "gene_info")</code></pre>
+<pre><code>## [1] "_id" "gene_name" "symbol"</code></pre>
+<p>The <em>gene_info</em> table provides the official gene symbol and the gene name. The column <em>symbol</em> matches the default <code>field</code> value of the <code>SymbolFilter</code>. For the <code>GenenameFilter</code> we would have to re-map its default field <code>"genename"</code> to the database column <em>gene_name</em>. There are many possibilities to do this, one would be to implement an own function to extract the field from the <code>AnnotationFilter</code> cla [...]
+<p>We next implement a simple <code>doExtractGene</code> function that retrieves data from the <em>gene_info</em> table and re-uses the <code>doExtract</code> function to extract specific data. The parameter <code>x</code> is now the database connection object.</p>
+<pre class="r"><code>doExtractGene <- function(x, filter) {
+ gene <- dbGetQuery(x, "select * from gene_info")
+ doExtract(gene, filter)
+}
+
+## Extract all entries for BCL2
+bcl2 <- doExtractGene(dbcon, SymbolFilter("BCL2"))
+
+bcl2</code></pre>
+<pre><code>## _id gene_name symbol
+## 487 487 BCL2, apoptosis regulator BCL2</code></pre>
+<p>This works, but is not really efficient, since the function first fetches the full database table and subsets it only afterwards. A much more efficient solution is to <em>translate</em> the <code>AnnotationFilter</code> class(es) to an SQL <em>where</em> condition and hence perform the filtering on the database level. Here we have to do some small modifications, since not all condition values can be used 1:1 in SQL calls. The condition <code>"=="</code> has for example to be [...]
+<pre class="r"><code>## Define a simple function that covers some condition conversion
+conditionForSQL <- function(x) {
+ switch(x,
+ "==" = "=",
+ x)
+}
+
+## Define a function to translate a filter into an SQL where condition.
+## Character values have to be quoted.
+where <- function(x) {
+ if (is(x, "CharacterFilter"))
+ value <- paste0("'", value(x), "'")
+ else value <- value(x)
+ paste0(field(x), conditionForSQL(condition(x)), value)
+}
+
+## Now "translate" a filter using this function
+where(SeqNameFilter("Y"))</code></pre>
+<pre><code>## [1] "seq_name='Y'"</code></pre>
+<p>Next we implement a new function which integrates the filter into the SQL call to let the database server take care of the filtering.</p>
+<pre class="r"><code>## Define a function that performs the filtering within the SQL query
+doExtractGene2 <- function(x, filter) {
+ if (is(filter, "formula"))
+ filter <- AnnotationFilter(filter)
+ query <- paste0("select * from gene_info where ", where(filter))
+ dbGetQuery(x, query)
+}
+
+bcl2 <- doExtractGene2(dbcon, ~ symbol == "BCL2")
+bcl2</code></pre>
+<pre><code>## _id gene_name symbol
+## 1 487 BCL2, apoptosis regulator BCL2</code></pre>
+<p>Below we compare the performance of both approaches.</p>
+<pre class="r"><code>system.time(doExtractGene(dbcon, ~ symbol == "BCL2"))</code></pre>
+<pre><code>## user system elapsed
+## 0.100 0.000 0.099</code></pre>
+<pre class="r"><code>system.time(doExtractGene2(dbcon, ~ symbol == "BCL2"))</code></pre>
+<pre><code>## user system elapsed
+## 0.016 0.000 0.013</code></pre>
+<p>Not surprisingly, the second approach is much faster.</p>
+<p>Be aware that the examples shown here are only for illustration purposes. In a real world situation additional factors, like combinations of filters, which database tables to join, which columns to be returned etc would have to be considered too.</p>
+</div>
+<div id="session-information" class="section level1">
+<h1><span class="header-section-number">5</span> Session information</h1>
+<pre class="r"><code>sessionInfo()</code></pre>
+<pre><code>## R version 3.4.0 (2017-04-21)
+## Platform: x86_64-pc-linux-gnu (64-bit)
+## Running under: Ubuntu 16.04.2 LTS
+##
+## Matrix products: default
+## BLAS: /home/biocbuild/bbs-3.5-bioc/R/lib/libRblas.so
+## LAPACK: /home/biocbuild/bbs-3.5-bioc/R/lib/libRlapack.so
+##
+## locale:
+## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
+## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=C
+## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
+## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
+## [9] LC_ADDRESS=C LC_TELEPHONE=C
+## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
+##
+## attached base packages:
+## [1] parallel stats4 stats graphics grDevices utils datasets
+## [8] methods base
+##
+## other attached packages:
+## [1] RSQLite_1.1-2 org.Hs.eg.db_3.4.1 AnnotationDbi_1.38.0
+## [4] IRanges_2.10.0 S4Vectors_0.14.0 Biobase_2.36.0
+## [7] BiocGenerics_0.22.0 AnnotationFilter_1.0.0 BiocStyle_2.4.0
+##
+## loaded via a namespace (and not attached):
+## [1] Rcpp_0.12.10 knitr_1.15.1 XVector_0.16.0
+## [4] magrittr_1.5 GenomicRanges_1.28.0 zlibbioc_1.22.0
+## [7] stringr_1.2.0 GenomeInfoDb_1.12.0 tools_3.4.0
+## [10] DBI_0.6-1 htmltools_0.3.5 lazyeval_0.2.0
+## [13] yaml_2.1.14 rprojroot_1.2 digest_0.6.12
+## [16] bookdown_0.3 GenomeInfoDbData_0.99.0 bitops_1.0-6
+## [19] RCurl_1.95-4.8 memoise_1.1.0 evaluate_0.10
+## [22] rmarkdown_1.4 stringi_1.1.5 compiler_3.4.0
+## [25] backports_1.0.5</code></pre>
+</div>
+
+
+
+</div>
+</div>
+
+</div>
+
+<script>
+
+// add bootstrap table styles to pandoc tables
+function bootstrapStylePandocTables() {
+ $('tr.header').parent('thead').parent('table').addClass('table table-condensed');
+}
+$(document).ready(function () {
+ bootstrapStylePandocTables();
+});
+
+
+</script>
+
+<script type="text/x-mathjax-config">
+ MathJax.Hub.Config({
+ "HTML-CSS": {
+ styles: {
+ ".MathJax_Display": {
+ "text-align": "center",
+ padding: "0px 150px 0px 65px",
+ margin: "0px 0px 0.5em"
+ },
+ }
+ }
+ });
+</script>
+<!-- dynamically load mathjax for compatibility with self-contained -->
+<script>
+ (function () {
+ var script = document.createElement("script");
+ script.type = "text/javascript";
+ script.src = "https://cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML";
+ document.getElementsByTagName("head")[0].appendChild(script);
+ })();
+</script>
+
+</body>
+</html>
diff --git a/man/AnnotationFilter.Rd b/man/AnnotationFilter.Rd
new file mode 100644
index 0000000..f7492d9
--- /dev/null
+++ b/man/AnnotationFilter.Rd
@@ -0,0 +1,230 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/AnnotationFilter.R, R/translate-utils.R
+\docType{methods}
+\name{AnnotationFilter}
+\alias{AnnotationFilter}
+\alias{CdsStartFilter}
+\alias{CdsEndFilter}
+\alias{ExonIdFilter}
+\alias{ExonNameFilter}
+\alias{ExonStartFilter}
+\alias{ExonEndFilter}
+\alias{ExonRankFilter}
+\alias{GeneIdFilter}
+\alias{GenenameFilter}
+\alias{GeneBiotypeFilter}
+\alias{GeneStartFilter}
+\alias{GeneEndFilter}
+\alias{EntrezFilter}
+\alias{SymbolFilter}
+\alias{TxIdFilter}
+\alias{TxNameFilter}
+\alias{TxBiotypeFilter}
+\alias{TxStartFilter}
+\alias{TxEndFilter}
+\alias{ProteinIdFilter}
+\alias{UniprotFilter}
+\alias{SeqNameFilter}
+\alias{SeqStrandFilter}
+\alias{AnnotationFilter-class}
+\alias{CharacterFilter-class}
+\alias{IntegerFilter-class}
+\alias{CdsStartFilter-class}
+\alias{CdsEndFilter-class}
+\alias{ExonIdFilter-class}
+\alias{ExonNameFilter-class}
+\alias{ExonStartFilter-class}
+\alias{ExonEndFilter-class}
+\alias{ExonRankFilter-class}
+\alias{GeneIdFilter-class}
+\alias{GenenameFilter-class}
+\alias{GeneBiotypeFilter-class}
+\alias{GeneStartFilter-class}
+\alias{GeneEndFilter-class}
+\alias{EntrezFilter-class}
+\alias{SymbolFilter-class}
+\alias{TxIdFilter-class}
+\alias{TxNameFilter-class}
+\alias{TxBiotypeFilter-class}
+\alias{TxStartFilter-class}
+\alias{TxEndFilter-class}
+\alias{ProteinIdFilter-class}
+\alias{UniprotFilter-class}
+\alias{SeqNameFilter-class}
+\alias{SeqStrandFilter-class}
+\alias{supportedFilters}
+\alias{show,AnnotationFilter-method}
+\alias{show,CharacterFilter-method}
+\alias{show,IntegerFilter-method}
+\alias{show,GRangesFilter-method}
+\alias{condition,AnnotationFilter-method}
+\alias{condition}
+\alias{value,AnnotationFilter-method}
+\alias{value}
+\alias{field,AnnotationFilter-method}
+\alias{field}
+\alias{GRangesFilter-class}
+\alias{.GRangesFilter}
+\alias{GRangesFilter}
+\alias{feature}
+\alias{AnnotationFilter}
+\alias{supportedFilters,missing-method}
+\alias{AnnotationFilter}
+\title{Filters for annotation objects}
+\usage{
+CdsStartFilter(value, condition = "==")
+CdsEndFilter(value, condition = "==")
+ExonIdFilter(value, condition = "==")
+ExonNameFilter(value, condition = "==")
+ExonRankFilter(value, condition = "==")
+ExonStartFilter(value, condition = "==")
+ExonEndFilter(value, condition = "==")
+GeneIdFilter(value, condition = "==")
+GenenameFilter(value, condition = "==")
+GeneBiotypeFilter(value, condition = "==")
+GeneStartFilter(value, condition = "==")
+GeneEndFilter(value, condition = "==")
+EntrezFilter(value, condition = "==")
+SymbolFilter(value, condition = "==")
+TxIdFilter(value, condition = "==")
+TxNameFilter(value, condition = "==")
+TxBiotypeFilter(value, condition = "==")
+TxStartFilter(value, condition = "==")
+TxEndFilter(value, condition = "==")
+ProteinIdFilter(value, condition = "==")
+UniprotFilter(value, condition = "==")
+SeqNameFilter(value, condition = "==")
+SeqStrandFilter(value, condition = "==")
+
+\S4method{condition}{AnnotationFilter}(object)
+
+\S4method{value}{AnnotationFilter}(object)
+
+\S4method{field}{AnnotationFilter}(object)
+
+GRangesFilter(value, feature = "gene", type = c("any", "start", "end",
+ "within", "equal"))
+
+feature(object)
+
+\S4method{supportedFilters}{missing}(object)
+
+AnnotationFilter(expr)
+}
+\arguments{
+\item{object}{An \code{AnnotationFilter} object.}
+
+\item{value}{\code{character()}, \code{integer()}, or
+\code{GRanges()} value for the filter}
+
+\item{feature}{\code{character(1)} defining on what feature the
+\code{GRangesFilter} should be applied. Choices could be
+\code{"gene"}, \code{"tx"} or \code{"exon"}.}
+
+\item{type}{\code{character(1)} indicating how overlaps are to be
+filtered. See \code{findOverlaps} in the IRanges package for a
+description of this argument.}
+
+\item{expr}{A filter expression, written as a \code{formula}, to be
+converted to an \code{AnnotationFilter} or \code{AnnotationFilterList}
+class. See below for examples.}
+
+\item{condition}{\code{character(1)} defining the condition to be
+used in the filter. For \code{IntegerFilter}, one of
+\code{"=="}, \code{"!="}, \code{">"}, \code{"<"}, \code{">="}
+or \code{"<="}. For \code{CharacterFilter}, one of \code{"=="},
+\code{"!="}, \code{"startsWith"} or \code{"endsWith"}. Default
+condition is \code{"=="}.}
+}
+\value{
+The constructor functions return an object extending
+ \code{AnnotationFilter}. For the return value of the other methods see
+ the methods' descriptions.
+
+\code{AnnotationFilter} returns an
+ \code{\link{AnnotationFilter}} or an \code{\link{AnnotationFilterList}}.
+}
+\description{
+The filters extending the base \code{AnnotationFilter} class
+represent a simple filtering concept for annotation resources.
+Each filter object is thought to filter on a single (database)
+table column using the provided values and the defined condition.
+
+Filter instances are created using the constructor functions (e.g.
+\code{GeneIdFilter}).
+
+\code{supportedFilters()} lists all defined filters. Packages using
+\code{AnnotationFilter} should implement the \code{supportedFilters} for
+their annotation resource object (e.g. for \code{object = "EnsDb"} in the
+\code{ensembldb} package) to list all supported filters for the specific
+resource.
+
+\code{condition()} get the \code{condition} value for
+ the filter \code{object}.
+
+\code{value()} get the \code{value} for the filter
+ \code{object}.
+
+\code{field()} get the \code{field} for the filter
+ \code{object}.
+
+\code{feature()} get the \code{feature} for the
+ \code{GRangesFilter} \code{object}.
+
+\code{AnnotationFilter} \emph{translates} a filter
+ expression such as \code{~ gene_id == "BCL2"} into a filter object
+ extending the \code{\link{AnnotationFilter}} class (in the example a
+ \code{\link{GeneIdFilter}} object) or an
+ \code{\link{AnnotationFilterList}} if the expression contains multiple
+ conditions (see examples below).
+}
+\details{
+By default filters are only available for tables containing the
+field on which the filter acts (i.e. that contain a column with the
+name matching the value of the \code{field} slot of the
+object). See the vignette for a description to use filters for
+databases in which the database table column name differs from the
+default \code{field} of the filter.
+
+Filter expressions for the \code{AnnotationFilter} class have to be
+ written as formulas, i.e. starting with a \code{~}.
+}
+\note{
+Translation of nested filter expressions using the
+ \code{AnnotationFilter} function is not yet supported.
+}
+\examples{
+## filter by GRanges
+GRangesFilter(GenomicRanges::GRanges("chr10:87869000-87876000"))
+## Create a SymbolFilter to filter on a gene's symbol.
+sf <- SymbolFilter("BCL2")
+sf
+
+## Create a GeneStartFilter to filter based on the genes' chromosomal start
+## coordinates
+gsf <- GeneStartFilter(10000, condition = ">")
+gsf
+
+supportedFilters()
+
+## Convert a filter expression based on a gene ID to a GeneIdFilter
+gnf <- AnnotationFilter(~ gene_id == "BCL2")
+gnf
+
+## Same conversion but for two gene IDs.
+gnf <- AnnotationFilter(~ gene_id \%in\% c("BCL2", "BCL2L11"))
+gnf
+
+## Converting an expression that combines multiple filters. As a result we
+## get an AnnotationFilterList containing the corresponding filters.
+## Be aware that nesting of expressions/filters does not work.
+flt <- AnnotationFilter(~ gene_id \%in\% c("BCL2", "BCL2L11") &
+ tx_biotype == "nonsense_mediated_decay" |
+ seq_name == "Y")
+flt
+
+}
+\seealso{
+\code{\link{AnnotationFilterList}} for combining
+ \code{AnnotationFilter} objects.
+}
diff --git a/man/AnnotationFilterList.Rd b/man/AnnotationFilterList.Rd
new file mode 100644
index 0000000..0f95296
--- /dev/null
+++ b/man/AnnotationFilterList.Rd
@@ -0,0 +1,82 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/AnnotationFilterList.R
+\docType{methods}
+\name{AnnotationFilterList}
+\alias{AnnotationFilterList}
+\alias{AnnotationFilterList-class}
+\alias{AnnotationFilterList}
+\alias{value,AnnotationFilterList-method}
+\alias{show,AnnotationFilterList-method}
+\title{Combining annotation filters}
+\usage{
+AnnotationFilterList(..., logOp = character())
+
+\S4method{value}{AnnotationFilterList}(object)
+
+\S4method{show}{AnnotationFilterList}(object)
+}
+\arguments{
+\item{...}{individual \code{\link{AnnotationFilter}} objects or a
+mixture of \code{AnnotationFilter} and
+\code{AnnotationFilterList} objects.}
+
+\item{logOp}{\code{character} of length being equal to the number
+of submitted \code{AnnotationFilter} objects -1. Each value
+representing the logical operation to combine consecutive
+filters, i.e. the first element being the logical operation to
+combine the first and second \code{AnnotationFilter}, the
+second element being the logical operation to combine the
+second and third \code{AnnotationFilter} and so on. Allowed
+values are \code{"&"} and \code{"|"}. The function assumes a
+logical \emph{and} between all elements by default.}
+
+\item{object}{An object of class \code{AnnotationFilterList}.}
+}
+\value{
+\code{AnnotationFilterList} returns an \code{AnnotationFilterList}.
+
+\code{value} returns a \code{list} with \code{AnnotationFilter}
+ objects.
+}
+\description{
+The \code{AnnotationFilterList} allows to combine
+ filter objects extending the \code{\link{AnnotationFilter}}
+ class to construct more complex queries. Consecutive filter
+ objects in the \code{AnnotationFilterList} can be combined by a
+ logical \emph{and} (\code{&}) or \emph{or} (\code{|}). The
+ \code{AnnotationFilterList} extends \code{list}, individual
+ elements can thus be accessed with \code{[[}.
+
+\code{value()} get a \code{list} with the
+ \code{AnnotationFilter} objects. Use \code{[[} to access
+ individual filters.
+}
+\examples{
+## Create some AnnotationFilters
+gf <- GenenameFilter(c("BCL2", "BCL2L11"))
+tbtf <- TxBiotypeFilter("protein_coding", condition = "!=")
+
+## Combine both to an AnnotationFilterList. By default elements are combined
+## using a logical "and" operator. The filter list represents thus a query
+## like: get all features where the gene name is either ("BCL2" or "BCL2L11")
+## and the transcript biotype is not "protein_coding".
+afl <- AnnotationFilterList(gf, tbtf)
+afl
+
+## Access individual filters.
+afl[[1]]
+
+## Create a filter in the form of: get all features where the gene name is
+## either ("BCL2" or "BCL2L11") and the transcript biotype is not
+## "protein_coding" or the seq_name is "Y". Hence, this will get all features
+## also found by the previous AnnotationFilterList and returns also all
+## features on chromosome Y.
+afl <- AnnotationFilterList(gf, tbtf, SeqNameFilter("Y"),
+ logOp = c("&", "|"))
+afl
+
+}
+\seealso{
+\code{\link{supportedFilters}} for available
+ \code{\link{AnnotationFilter}} objects
+}
diff --git a/tests/testthat.R b/tests/testthat.R
new file mode 100644
index 0000000..dd1cbb3
--- /dev/null
+++ b/tests/testthat.R
@@ -0,0 +1,4 @@
+library(testthat)
+library(AnnotationFilter)
+
+test_check("AnnotationFilter")
diff --git a/tests/testthat/test_AnnotationFilter.R b/tests/testthat/test_AnnotationFilter.R
new file mode 100644
index 0000000..400cac8
--- /dev/null
+++ b/tests/testthat/test_AnnotationFilter.R
@@ -0,0 +1,75 @@
+context("AnnotationFilter")
+
+test_that("supportedFilters() works", {
+ expect_true(inherits(supportedFilters(), "character"))
+ expect_identical(
+ length(supportedFilters()),
+ length(unlist(AnnotationFilter:::.FIELD, use.names=FALSE)) + 1L
+ )
+})
+
+test_that("SymbolFilter as representative for character filters", {
+ expect_true(validObject(new("SymbolFilter")))
+ expect_error(SymbolFilter())
+ expect_error(SymbolFilter(1, ">"))
+ expect_error(SymbolFilter(1, "foo"))
+ expect_error(SymbolFilter(c("foo","bar"), "startsWith"))
+ ## Getter / setter
+ fl <- SymbolFilter("BCL2")
+ expect_equal(value(fl), "BCL2")
+ fl <- SymbolFilter(c(4, 5))
+ expect_equal(value(fl), c("4", "5"))
+ fl <- SymbolFilter(3)
+ expect_equal(value(fl), "3")
+ expect_error(SymbolFilter(NA))
+ ## condition.
+ expect_equal(condition(fl), "==")
+ fl <- SymbolFilter("a", condition = "!=")
+ expect_equal(condition(fl), "!=")
+ expect_error(SymbolFilter("a", condition = "<"))
+ expect_error(SymbolFilter("a", condition = ""))
+ expect_error(SymbolFilter("a", condition = c("==", ">")))
+ expect_error(SymbolFilter("a", condition = NULL))
+ expect_error(SymbolFilter("a", condition = NA))
+ expect_error(SymbolFilter("a", condition = 4))
+})
+
+test_that("GeneStartFilter as representative for integer filters", {
+ gsf <- GeneStartFilter(10000, condition = ">")
+ expect_equal(condition(gsf), ">")
+ expect_error(GeneStartFilter("3"))
+ expect_error(GeneStartFilter("B"))
+ expect_error(GeneStartFilter(NA))
+ expect_error(GeneStartFilter(NULL))
+ expect_error(GeneStartFilter())
+ ## Condition
+ expect_error(GeneStartFilter(10000, condition = "startsWith"))
+ expect_error(GeneStartFilter(10000, condition = "endsWith"))
+ expect_error(GeneStartFilter(10000, condition = c("==", "<")))
+})
+
+test_that("GRangesFilter works", {
+ GRanges <- GenomicRanges::GRanges
+ grf <- GRangesFilter(GRanges("chr10:87869000-87876000"))
+ expect_equal(condition(grf), "any")
+ expect_error(GRangesFilter(value = 3))
+ expect_error(GRangesFilter(
+ GRanges("chr10:87869000-87876000"),
+ type = "=="
+ ))
+ grf <- GRangesFilter(
+ GRanges("chr10:87869000-87876000"),
+ type = "within",
+ feature = "tx"
+ )
+ expect_equal(condition(grf), "within")
+ expect_equal(feature(grf), "tx")
+})
+
+test_that("fieldToClass works", {
+ expect_identical(AnnotationFilter:::.fieldToClass("gene_id"),
+ "GeneIdFilter")
+ ## Support replacement for multiple _ : issue #13
+ expect_identical(AnnotationFilter:::.fieldToClass("gene_seq_start"),
+ "GeneSeqStartFilter")
+})
diff --git a/tests/testthat/test_AnnotationFilterList.R b/tests/testthat/test_AnnotationFilterList.R
new file mode 100644
index 0000000..c56c904
--- /dev/null
+++ b/tests/testthat/test_AnnotationFilterList.R
@@ -0,0 +1,52 @@
+context("AnnotationFilterList")
+
+test_that("AnnotationFilterList() works", {
+    ## Internal helper used below to read the logical operator(s) of a list.
+    logOp <- AnnotationFilter:::.logOp
+    f1 <- GeneIdFilter("somegene")
+    f2 <- SeqNameFilter("chr3")
+    f3 <- GeneBiotypeFilter("protein_coding", "!=")
+
+    ## Two filters, no logOp specified: all operators have to be "&".
+    fL <- AnnotationFilter:::AnnotationFilterList(f1, f2)
+    expect_true(length(fL) == 2)
+    expect_equal(fL[[1]], f1)
+    expect_equal(fL[[2]], f2)
+    expect_true(all(logOp(fL) == "&"))
+
+    ## Three filters with explicit operators; order of filters and operators
+    ## has to be kept.
+    fL <- AnnotationFilter:::AnnotationFilterList(f1, f2, f3,
+                                                  logOp = c("&", "|"))
+    expect_true(length(fL) == 3)
+    expect_equal(fL[[1]], f1)
+    expect_equal(fL[[2]], f2)
+    expect_equal(fL[[3]], f3)
+    ## Read the slot directly with the S4 slot operator.
+    expect_equal(fL@logOp, c("&", "|"))
+
+    ## An AnnotationFilterList containing another AnnotationFilterList.
+    fL <- AnnotationFilter:::AnnotationFilterList(f1, f2, logOp = "|")
+    fL2 <- AnnotationFilter:::AnnotationFilterList(f3, fL, logOp = "&")
+    expect_true(length(fL) == 2)
+    expect_true(length(fL2) == 2)
+    expect_true(is(value(fL2)[[1]], "GeneBiotypeFilter"))
+    expect_true(is(value(fL2)[[2]], "AnnotationFilterList"))
+    expect_equal(value(fL2)[[2]], fL)
+    expect_equal(fL2[[2]], fL)
+    ## Outer and nested list each keep their own operator.
+    expect_equal(logOp(fL2), c("&"))
+    expect_equal(logOp(fL2[[2]]), c("|"))
+})
+
+test_that("empty elements in AnnotationFilterList", {
+    ## empty elements should be removed from the AnnotationFilterList.
+    empty_afl <- AnnotationFilterList()
+    afl <- AnnotationFilterList(empty_afl)
+    expect_true(length(afl) == 0)
+    afl <- AnnotationFilterList(GeneIdFilter(4), empty_afl)
+    expect_true(length(afl) == 1)
+    afl <- AnnotationFilterList(GeneIdFilter(4),
+                                AnnotationFilter(~ gene_id == 3 | seq_name == 4),
+                                empty_afl)
+    expect_true(length(afl) == 2)
+    ## Check validate: bypass the constructor and append an empty list directly
+    ## to the data slot.
+    afl@.Data <- c(afl@.Data, list(empty_afl))
+    ## Fix also the logOp, so that only the empty element can trigger the
+    ## validation error.
+    afl@logOp <- c(afl@logOp, "|")
+    expect_error(validObject(afl))
+})
diff --git a/tests/testthat/test_translate-utils.R b/tests/testthat/test_translate-utils.R
new file mode 100644
index 0000000..0fd31c0
--- /dev/null
+++ b/tests/testthat/test_translate-utils.R
@@ -0,0 +1,108 @@
+context("expression translation")
+
+test_that("translation of expression works for single filter/condition", {
+    ## Each filter expression has to translate into the same object that the
+    ## corresponding constructor call creates.
+    ## Check for some character filter.
+    ## exon_id
+    expect_equal(ExonIdFilter("EX1", condition = "=="),
+                 AnnotationFilter(~ exon_id == "EX1"))
+    expect_equal(ExonIdFilter(c("EX1", "EX2"), condition = "!="),
+                 AnnotationFilter(~ exon_id != c("EX1", "EX2")))
+    ## seq_name
+    expect_equal(SeqNameFilter(c("chr3", "chrX"), condition = "=="),
+                 AnnotationFilter(~ seq_name == c("chr3", "chrX")))
+    ## An expression using %in% has to equal a filter with condition "==".
+    expect_equal(SeqNameFilter(1:3, condition = "=="),
+                 AnnotationFilter(~ seq_name %in% 1:3))
+    ## Check IntegerFilter
+    expect_equal(GeneStartFilter(123, condition = ">"),
+                 AnnotationFilter(~ gene_start > 123))
+    expect_equal(TxStartFilter(123, condition = "<"),
+                 AnnotationFilter(~ tx_start < 123))
+    expect_equal(GeneEndFilter(123, condition = ">="),
+                 AnnotationFilter(~ gene_end >= 123))
+    expect_equal(ExonEndFilter(123, condition = "<="),
+                 AnnotationFilter(~ exon_end <= 123))
+    ## Test exceptions/errors.
+    expect_error(AnnotationFilter(~ not_existing == 1:3))
+    ## Throws an error, but is not self-explanatory.
+    expect_error(AnnotationFilter(~ gene_id * 3))
+})
+
+test_that("translation of combined expressions works", {
+    ## 2 filters combined with a logical AND.
+    translated <- AnnotationFilter(~ exon_id == "EX1" & genename == "BCL2")
+    expected <- AnnotationFilterList(ExonIdFilter("EX1"),
+                                     GenenameFilter("BCL2"))
+    expect_equal(translated, expected)
+    ## 2 filters combined with a logical OR.
+    translated <- AnnotationFilter(~ exon_id == "EX1" | genename != "BCL2")
+    expected <- AnnotationFilterList(ExonIdFilter("EX1"),
+                                     GenenameFilter("BCL2", "!="),
+                                     logOp = "|")
+    expect_equal(translated, expected)
+    ## 3 filters.
+    translated <- AnnotationFilter(~ exon_id == "EX1" & genename == "BCL2" |
+                                       seq_name != 3)
+    ## Expect an AnnotationFilterList of length 3.
+    expect_equal(length(translated), 3)
+    expected <- AnnotationFilterList(ExonIdFilter("EX1"),
+                                     GenenameFilter("BCL2"),
+                                     SeqNameFilter(3, "!="),
+                                     logOp = c("&", "|"))
+    expect_equal(translated, expected)
+    ## 4 filters.
+    translated <- AnnotationFilter(~ exon_id == "EX1" & genename == "BCL2" |
+                                       seq_name != 3 | seq_name == "Y")
+    expect_equal(length(translated), 4)
+    expected <- AnnotationFilterList(ExonIdFilter("EX1"),
+                                     GenenameFilter("BCL2"),
+                                     SeqNameFilter(3, "!="),
+                                     SeqNameFilter("Y"),
+                                     logOp = c("&", "|", "|"))
+    expect_equal(translated, expected)
+})
+
+test_that("translation works from within other functions", {
+    expected <- AnnotationFilter(~ gene_id == 4)
+    ## Thin wrapper that only forwards its argument to AnnotationFilter().
+    wrapper <- function(x) {
+        AnnotationFilter(x)
+    }
+    ## Formula passed directly as argument.
+    expect_equal(wrapper(~ gene_id == 4), expected)
+    ## Formula stored in a variable first.
+    filter_expr <- ~ gene_id == 4
+    expect_equal(wrapper(filter_expr), expected)
+})
+
+## This might be a test if we get the nesting working.
+## test_that("translation of nested expressions works" {
+## res <- convertFilterExpression((exon_id == "EX1" & gene_id == "BCL2") |
+## (exon_id == "EX3" & gene_id == "BCL2L11"))
+## expect_equal(logOp(res), "|")
+## expect_true(is(res[[1]], "AnnotationFilterList"))
+## expect_equal(res[[1]][[1]], ExonIdFilter("EX1"))
+## expect_equal(res[[1]][[2]], GeneIdFilter("BCL2"))
+## expect_equal(logOp(res[[1]]), "&")
+## expect_true(is(res[[2]], "AnnotationFilterList"))
+## expect_equal(res[[2]][[1]], ExonIdFilter("EX3"))
+## expect_equal(res[[2]][[2]], GeneIdFilter("BCL2L11"))
+## expect_equal(logOp(res[[2]]), "&")
+## ##
+## res <- convertFilterExpression(seq_name == "Y" |
+## (exon_id == "EX1" & gene_id == "BCL2") &
+## (exon_id == "EX3" & gene_id == "BCL2L11"))
+## ## Expect: length 3, first being a SeqNameFilter, second an
+## ## AnnotationFilterList, third a AnnotationFilterList.
+## expect_equal(res[[1]], SeqNameFilter("Y"))
+## expect_equal(logOp(res), "|")
+## expect_true(is(res[[2]], "AnnotationFilterList"))
+## expect_equal(res[[1]][[1]], ExonIdFilter("EX1"))
+## expect_equal(res[[1]][[2]], GeneIdFilter("BCL2"))
+## expect_equal(logOp(res[[1]]), "&")
+## expect_true(is(res[[2]], "AnnotationFilterList"))
+## expect_equal(res[[2]][[1]], ExonIdFilter("EX3"))
+## expect_equal(res[[2]][[2]], GeneIdFilter("BCL2L11"))
+## expect_equal(logOp(res[[2]]), "&")
+
+## expect_true(is(res[[1]], "AnnotationFilterList"))
+## expect_true(is(res[[2]], "AnnotationFilterList"))
+
+## convertFilterExpression((gene_id == 3) ()
+## })
+
diff --git a/vignettes/AnnotationFilter.Rmd b/vignettes/AnnotationFilter.Rmd
new file mode 100644
index 0000000..0dddd45
--- /dev/null
+++ b/vignettes/AnnotationFilter.Rmd
@@ -0,0 +1,404 @@
+---
+title: "Facilities for Filtering Bioconductor Annotation Resources"
+output:
+ BiocStyle::html_document2:
+ toc_float: true
+vignette: >
+ %\VignetteIndexEntry{Facilities for Filtering Bioconductor Annotation resources}
+ %\VignetteEngine{knitr::rmarkdown}
+ %\VignetteEncoding{UTF-8}
+ %\VignettePackage{AnnotationFilter}
+ %\VignetteDepends{org.Hs.eg.db,BiocStyle,RSQLite}
+---
+
+```{r style, echo = FALSE, results = 'asis', message=FALSE}
+BiocStyle::markdown()
+```
+
+**Package**: `r Biocpkg("AnnotationFilter")`<br />
+**Authors**: `r packageDescription("AnnotationFilter")[["Author"]] `<br />
+**Last modified:** `r file.info("AnnotationFilter.Rmd")$mtime`<br />
+**Compiled**: `r date()`
+
+
+# Introduction
+
+A large variety of annotation resources are available in Bioconductor. Accessing
+the full content of these databases or even of single tables is computationally
+expensive and in many instances not required, as users may want to extract only
+sub-sets of the data e.g. genomic coordinates of a single gene. In that respect,
+filtering annotation resources before data extraction has a major impact on
+performance and increases the usability of such genome-scale databases.
+
+The `r Biocpkg("AnnotationFilter")` package was thus developed to provide basic
+filter classes to enable a common filtering framework for Bioconductor
+annotation resources. `r Biocpkg("AnnotationFilter")` defines filter classes for
+some of the most commonly used features in annotation databases, such as
+*symbol* or *genename*. Each filter class is supposed to work on a single
+database table column and to facilitate filtering on the provided values. Such
+filter classes enable the user to build complex queries to retrieve specific
+annotations without needing to know column or table names or the layout of the
+underlying databases. While initially being developed to be used in the
+`r Biocpkg("Organism.dplyr")` and `r Biocpkg("ensembldb")` packages, the filter
+classes and the related filtering concept can be easily added to other
+annotation packages too.
+
+
+# Filter classes
+
+All filter classes extend the basic `AnnotationFilter` class and take one or
+more *values* and a *condition* to allow filtering on a single database table
+column. Based on the type of the input value, filter classes are divided into:
+
+- `CharacterFilter`: takes a `character` value of length >= 1 and supports
+ conditions `==`, `!=`, `startsWith` and `endsWith`. An example would be a
+ `GeneIdFilter` that allows to filter on gene IDs.
+
+- `IntegerFilter`: takes a single `integer` as input and supports the conditions
+ `==`, `!=`, `>`, `<`, `>=` and `<=`. An example would be a `GeneStartFilter`
+ that filters results on the (chromosomal) start coordinates of genes.
+
+- `GRangesFilter`: is a special filter, as it takes a `GRanges` as `value` and
+ performs the filtering on a combination of columns (i.e. start and end
+ coordinate as well as sequence name and strand). To be consistent with the
+ `findOverlaps` method from the `r Biocpkg("IRanges")` package, the constructor
+ of the `GRangesFilter` filter takes a `type` argument to define its
+ condition. Supported values are `"any"` (the default) that retrieves all
+ entries overlapping the `GRanges`, `"start"` and `"end"` matching all features
+ with the same start and end coordinate respectively, `"within"` that matches
+ all features that are *within* the range defined by the `GRanges` and
+ `"equal"` that returns features that are equal to the `GRanges`.
+
+The names of the filter classes are intuitive, the first part corresponding to
+the database column name with each character following a `_` being capitalized,
+followed by the key word `Filter`. The name of a filter for a database table
+column `gene_id` is thus called `GeneIdFilter`. The default database column for
+a filter is stored in its `field` slot (accessible *via* the `field` method).
+
+The `supportedFilters` method can be used to get an overview of all available
+filter objects defined in `AnnotationFilter`.
+
+```{r supportedFilters}
+library(AnnotationFilter)
+supportedFilters()
+```
+
+Note that the `AnnotationFilter` package provides only the filter classes
+but not the functionality to apply the filtering. Such functionality is
+annotation resource and database layout dependent and needs thus to be
+implemented in the packages providing access to annotation resources.
+
+
+# Usage
+
+Filters are created *via* their dedicated constructor functions, such as the
+`GeneIdFilter` function for the `GeneIdFilter` class. Because of this simple and
+cheap creation, filter classes are thought to be *read-only* and thus don't
+provide *setter* methods to change their slot values. In addition to the
+constructor functions, `AnnotationFilter` provides the functionality to
+*translate* query expressions into filter classes (see further below for an
+example).
+
+Below we create a `SymbolFilter` that could be used to filter an annotation
+resource to retrieve all entries associated with the specified symbol value(s).
+
+```{r symbol-filter}
+library(AnnotationFilter)
+
+smbl <- SymbolFilter("BCL2")
+smbl
+```
+
+Such a filter is supposed to be used to retrieve all entries associated to
+features with a value in a database table column called *symbol* matching the
+filter's value `"BCL2"`.
+
+Using the `"startsWith"` condition we could define a filter to retrieve all
+entries for genes with a gene name/symbol starting with the specified value
+(e.g. `"BCL2"` and `"BCL2L11"` for the example below).
+
+```{r symbol-startsWith}
+smbl <- SymbolFilter("BCL2", condition = "startsWith")
+smbl
+```
+
+In addition to the constructor functions, `AnnotationFilter` provides a
+functionality to create filter instances in a more natural and intuitive way by
+*translating* filter expressions (written as a *formula*, i.e. starting with a
+`~`).
+
+```{r convert-expression}
+smbl <- AnnotationFilter(~ symbol == "BCL2")
+smbl
+```
+
+Individual `AnnotationFilter` objects can be combined in an
+`AnnotationFilterList`. This class extends `list` and provides an additional
+`logOp` slot that defines how its individual filters are supposed to be
+combined. The length of `logOp` has to be 1 less than the number of filter
+objects. Each element in `logOp` defines how two consecutive filters should
+be combined. Below we create a `AnnotationFilterList` containing two filter
+objects to be combined with a logical *AND*.
+
+```{r convert-multi-expression}
+flt <- AnnotationFilter(~ symbol == "BCL2" &
+ tx_biotype == "protein_coding")
+flt
+```
+
+Note that the `AnnotationFilter` function does not (yet) support translation of
+nested expressions, such as `(symbol == "BCL2L11" & tx_biotype ==
+"nonsense_mediated_decay") | (symbol == "BCL2" & tx_biotype ==
+"protein_coding")`. Such queries can however be built by nesting
+`AnnotationFilterList` classes.
+
+```{r nested-query}
+## Define the filter query for the first pair of filters.
+afl1 <- AnnotationFilterList(SymbolFilter("BCL2L11"),
+ TxBiotypeFilter("nonsense_mediated_decay"))
+## Define the filter query for the second pair of filters.
+afl2 <- AnnotationFilterList(SymbolFilter("BCL2"),
+ TxBiotypeFilter("protein_coding"))
+## Now combine both with a logical OR
+afl <- AnnotationFilterList(afl1, afl2, logOp = "|")
+
+afl
+```
+
+This `AnnotationFilterList` would now select all entries for all transcripts of
+the gene *BCL2L11* with the biotype *nonsense_mediated_decay* or for all protein
+coding transcripts of the gene *BCL2*.
+
+
+# Using `AnnotationFilter` in other packages
+
+The `AnnotationFilter` package does only provide filter classes, but no
+filtering functionality. This has to be implemented in the package using the
+filters. In this section we first show in a very simple example how
+`AnnotationFilter` classes could be used to filter a `data.frame` and
+subsequently explore how a simple filter framework could be implemented for
+SQL-based annotation resources.
+
+Let's first define a simple `data.frame` containing the data we want to
+filter. Note that subsetting this `data.frame` using `AnnotationFilter` is
+obviously not the best solution, but it should help to understand the basic
+concept.
+
+```{r define-data.frame}
+## Define a simple gene table
+gene <- data.frame(gene_id = 1:10,
+ symbol = c(letters[1:9], "b"),
+ seq_name = paste0("chr", c(1, 4, 4, 8, 1, 2, 5, 3, "X", 4)),
+ stringsAsFactors = FALSE)
+gene
+```
+
+Next we generate a `SymbolFilter` and inspect what information we can extract
+from it.
+
+```{r simple-symbol}
+smbl <- SymbolFilter("b")
+```
+
+We can access the filter *condition* using the `condition` method
+
+```{r simple-symbol-condition}
+condition(smbl)
+```
+
+The value of the filter using the `value` method
+
+```{r simple-symbol-value}
+value(smbl)
+```
+
+And finally the *field* (i.e. column in the data table) using the `field`
+method.
+
+```{r simple-symbol-field}
+field(smbl)
+```
+
+With this information we can define a simple function that takes the data table
+and the filter as input and returns a `logical` with length equal to the number
+of rows of the table, `TRUE` for rows matching the filter.
+
+```{r doMatch}
+
+doMatch <- function(x, filter) {
+    ## Column of x named by the filter's field.
+    column <- x[, field(filter)]
+    ## The filter's condition (e.g. "==") is used as the name of the function
+    ## that compares the column with the filter's value.
+    do.call(condition(filter), list(column, value(filter)))
+}
+
+## Apply this function
+doMatch(gene, smbl)
+
+```
+
+Note that this simple function does not support multiple filters and also not
+conditions `"startsWith"` or `"endsWith"`. Next we define a second function that
+extracts the relevant data from the data resource.
+
+```{r doExtract}
+
+doExtract <- function(x, filter) {
+    ## Keep only the rows for which doMatch() returns TRUE.
+    keep <- doMatch(x, filter)
+    x[keep, ]
+}
+
+## Apply it on the data
+doExtract(gene, smbl)
+```
+
+We could even modify the `doMatch` function to enable filter expressions.
+
+```{r doMatch-formula}
+
+doMatch <- function(x, filter) {
+    ## Accept filter expressions too: translate a formula into a filter first.
+    if (is(filter, "formula")) {
+        filter <- AnnotationFilter(filter)
+    }
+    column <- x[, field(filter)]
+    do.call(condition(filter), list(column, value(filter)))
+}
+
+doExtract(gene, ~ gene_id == '2')
+
+```
+
+For such simple examples `AnnotationFilter` might be an overkill as the same
+could be achieved (much simpler) using standard R operations. A real case
+scenario in which `AnnotationFilter` becomes useful are SQL-based annotation
+resources. We will thus explore next how SQL resources could be filtered using
+`AnnotationFilter`.
+
+We use the SQLite database from the `r Biocpkg("org.Hs.eg.db")` package that
+provides a variety of annotations for all human genes. Using the packages'
+connection to the database we inspect first what database tables are available
+and then select one for our simple filtering example.
+
+Below we load the `org.Hs.eg.db` and `RSQLite` packages, get the connection to
+the package's SQLite database and list the database tables it provides. The
+*fields* (i.e. columns) of the table used in the example are inspected
+afterwards.
+
+```{r orgDb, message = FALSE}
+## Load the required packages
+library(org.Hs.eg.db)
+library(RSQLite)
+## Get the database connection
+dbcon <- org.Hs.eg_dbconn()
+
+## What tables do we have?
+dbListTables(dbcon)
+```
+
+`org.Hs.eg.db` provides many different tables, one for each identifier or
+annotation resource. We will use the *gene_info* table and determine which
+*fields* (i.e. columns) the table provides.
+
+```{r gene_info}
+## What fields are there in the gene_info table?
+dbListFields(dbcon, "gene_info")
+```
+
+The *gene_info* table provides the official gene symbol and the gene name. The
+column *symbol* matches the default `field` value of the `SymbolFilter`. For the
+`GenenameFilter` we would have to re-map its default field `"genename"` to the
+database column *gene_name*. There are many possibilities to do this, one would
+be to implement an own function to extract the field from the `AnnotationFilter`
+classes specific to the database. This function eventually renames the extracted
+field value to match the corresponding name of the database column name.
+
+We next implement a simple `doExtractGene` function that retrieves data from the
+*gene_info* table and re-uses the `doExtract` function to extract specific
+data. The parameter `x` is now the database connection object.
+
+```{r doExtractSQL}
+
+doExtractGene <- function(x, filter) {
+    ## Fetch the complete table; the subsetting is done in R by doExtract().
+    gene_info <- dbGetQuery(x, "select * from gene_info")
+    doExtract(gene_info, filter)
+}
+
+## Extract all entries for BCL2
+bcl2 <- doExtractGene(dbcon, SymbolFilter("BCL2"))
+
+bcl2
+```
+
+This works, but is not really efficient, since the function first fetches the
+full database table and subsets it only afterwards. A much more efficient
+solution is to *translate* the `AnnotationFilter` class(es) to an SQL *where*
+condition and hence perform the filtering on the database level. Here we have to
+do some small modifications, since not all condition values can be used 1:1 in
+SQL calls. The condition `"=="` has for example to be converted into `"="` and
+the `"startsWith"` into a SQL `"like"` by adding also a `"%"` wildcard to the
+value of the filter. We would also have to deal with filters that have a `value`
+of length > 1. A `SymbolFilter` with a `value` being `c("BCL2", "BCL2L11")`
+would for example have to be converted to a SQL call `"symbol in
+('BCL2','BCL2L11')"`. Here we skip these special cases and define a simple
+function that translates an `AnnotationFilter` to a *where* condition to be
+included into the SQL call. Depending on whether the filter extends
+`CharacterFilter` or `IntegerFilter` the value has also to be quoted.
+
+```{r simpleSQL}
+
+## Define a simple function that covers some condition conversion
+conditionForSQL <- function(x) {
+    ## "==" is mapped to "="; every other condition is returned unchanged.
+    sql_condition <- switch(x, "==" = "=", x)
+    sql_condition
+}
+
+## Define a function to translate a filter into an SQL where condition.
+## Character values have to be quoted.
+where <- function(x) {
+    ## Values of a CharacterFilter are wrapped in single quotes. Single quotes
+    ## embedded in a value are doubled (the SQL way of escaping them), so that
+    ## a value such as "5'-nucleotidase" still results in valid SQL.
+    if (is(x, "CharacterFilter"))
+        val <- paste0("'", gsub("'", "''", value(x), fixed = TRUE), "'")
+    else val <- value(x)
+    ## Result has the form <field><condition><value>, e.g. seq_name='Y'.
+    paste0(field(x), conditionForSQL(condition(x)), val)
+}
+
+## Now "translate" a filter using this function
+where(SeqNameFilter("Y"))
+
+```
+
+Next we implement a new function which integrates the filter into the SQL call
+to let the database server take care of the filtering.
+
+```{r doExtractGene2}
+
+## Define a function that adds the filter as where condition to the SQL query.
+doExtractGene2 <- function(x, filter) {
+    ## Filter expressions are translated into a filter object first.
+    if (is(filter, "formula")) {
+        filter <- AnnotationFilter(filter)
+    }
+    sql <- paste0("select * from gene_info where ", where(filter))
+    dbGetQuery(x, sql)
+}
+
+bcl2 <- doExtractGene2(dbcon, ~ symbol == "BCL2")
+bcl2
+
+```
+
+Below we compare the performance of both approaches.
+
+```{r performance}
+system.time(doExtractGene(dbcon, ~ symbol == "BCL2"))
+
+system.time(doExtractGene2(dbcon, ~ symbol == "BCL2"))
+
+```
+
+Not surprisingly, the second approach is much faster.
+
+Be aware that the examples shown here are only for illustration purposes. In a
+real world situation additional factors, like combinations of filters, which
+database tables to join, which columns to be returned etc would have to be
+considered too.
+
+# Session information
+
+```{r si}
+sessionInfo()
+```
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/r-bioc-annotationfilter.git
More information about the debian-med-commit
mailing list