[med-svn] [r-bioc-biomart] 01/05: New upstream version 2.34.0
Andreas Tille
tille at debian.org
Wed Nov 8 13:43:11 UTC 2017
This is an automated email from the git hooks/post-receive script.
tille pushed a commit to branch master
in repository r-bioc-biomart.
commit 2380ae3693ae779a918f4ecddd64bc0cfd232f50
Author: Andreas Tille <tille at debian.org>
Date: Wed Nov 8 14:32:09 2017 +0100
New upstream version 2.34.0
---
DESCRIPTION | 6 +-
NAMESPACE | 12 +-
NEWS | 21 ++
R/biomaRt.R | 317 ++++++++++--------------
R/ensembl.R | 24 ++
R/methods-Mart.R | 20 +-
R/utilityFunctions.R | 115 ++++++++-
build/vignette.rds | Bin 211 -> 212 bytes
inst/doc/biomaRt.R | 13 +-
inst/doc/biomaRt.Rmd | 20 +-
inst/doc/biomaRt.html | 464 +++++++++++++++++++++++------------
man/listEnsemblArchives.Rd | 18 ++
man/listMarts.Rd | 6 +-
man/useMart.Rd | 2 +-
tests/testthat/test_hostProcessing.R | 16 ++
tests/testthat/test_useMart.R | 13 +
vignettes/biomaRt.Rmd | 20 +-
17 files changed, 721 insertions(+), 366 deletions(-)
diff --git a/DESCRIPTION b/DESCRIPTION
index ca0db64..263e019 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,5 +1,5 @@
Package: biomaRt
-Version: 2.32.1
+Version: 2.34.0
Title: Interface to BioMart databases (e.g. Ensembl, COSMIC, Wormbase
and Gramene)
Author: Steffen Durinck <biomartdev at gmail.com>, Wolfgang Huber
@@ -7,7 +7,7 @@ Contributors: Sean Davis <sdavis2 at mail.nih.gov>, Francois Pepin, Vince
S. Buffalo, Mike Smith
Maintainer: Steffen Durinck <biomartdev at gmail.com>
Depends: methods
-Imports: utils, XML, RCurl, AnnotationDbi
+Imports: utils, XML, RCurl, AnnotationDbi, progress, stringr
Suggests: annotate, BiocStyle, knitr, rmarkdown, testthat
VignetteBuilder: knitr
biocViews: Annotation
@@ -27,4 +27,4 @@ Description: In recent years a wealth of biological data has become available
License: Artistic-2.0
LazyLoad: yes
NeedsCompilation: no
-Packaged: 2017-06-08 22:19:24 UTC; biocbuild
+Packaged: 2017-10-30 22:42:08 UTC; biocbuild
diff --git a/NAMESPACE b/NAMESPACE
index b006204..8dcf92a 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -1,10 +1,20 @@
import(methods)
import(RCurl,XML)
+
importFrom(utils, edit, head, read.table)
importFrom(AnnotationDbi, keys, columns, keytypes, select)
+importFrom(progress, progress_bar)
+importFrom(stringr, str_extract_all)
+
#for some reason RCurl needs to have findHTTPHeaderEncoding exported -
#remove it from the exports if this ever gets fixed
-export(listMarts, getGene, getSequence, exportFASTA, useMart, listDatasets, useDataset, listEnsembl, useEnsembl, listAttributes, listFilters, getBM, getXML,getLDS, attributePages, filterOptions,filterType, getBMlist, NP2009code, keys, columns, keytypes, select)
+export(listMarts, getGene, getSequence, exportFASTA, useMart, listDatasets,
+ useDataset, listEnsembl, useEnsembl, listAttributes, listFilters,
+ getBM, getXML,getLDS, attributePages, filterOptions,filterType,
+ getBMlist, NP2009code, keys, columns, keytypes, select,
+ listEnsemblArchives)
+
exportClasses(Mart)
+
exportMethods("show")
diff --git a/NEWS b/NEWS
index 027282f..21be840 100644
--- a/NEWS
+++ b/NEWS
@@ -1,3 +1,24 @@
+CHANGES IN VERSION 2.34.0
+-------------------------
+
+NEW FEATURES
+
+ o Added the listEnsemblArchives() function. This returns a table of the
+ available Ensembl archives, and replaces the archive = TRUE argument to
+ several functions, which was no longer working.
+
+BUG FIXES
+
+ o The Ensembl BioMart server doesn't always respond well if queries with
+ more than 500 filter values are submitted. If a query that exceeds this
+ limit is detected, biomaRt will now submit the query in batches and
+ concatenate the results when completed.
+
+MINOR CHANGES
+
+ o You can now provide a host with 'http://' at the start, or a trailing
+ '/' (typically copy/pasted from a browser) and useMart() and listMarts() will cope.
+
CHANGES IN VERSION 2.32.0
-------------------------
diff --git a/R/biomaRt.R b/R/biomaRt.R
index ae0ddd2..0e1f8f9 100644
--- a/R/biomaRt.R
+++ b/R/biomaRt.R
@@ -83,11 +83,13 @@ listMarts <- function( mart = NULL, host="www.ensembl.org", path="/biomart/marts
"&redirect=no",
"")
+ host <- .cleanHostURL(host)
if(archive) {
- request = paste0("http://",host,":",port,path,"?type=registry_archive&requestid=biomaRt")
+ warning("The archive = TRUE argument is now deprecated.\nUse listEnsemblMarts() to find the URL to directly query an Ensembl archive.")
+ request = paste0(host, ":", port, path, "?type=registry_archive&requestid=biomaRt")
}
else {
- request = paste0("http://", host, ":", port, path, "?type=registry&requestid=biomaRt", redirect)
+ request = paste0(host, ":", port, path, "?type=registry&requestid=biomaRt", redirect)
}
}
else{
@@ -178,11 +180,13 @@ useMart <- function(biomart, dataset, host = "www.ensembl.org", path = "/biomart
stop("biomart argument is not a string. ",
"The biomart argument should be a single character string")
}
- #if(biomart == "ensembl" & (host == "www.ensembl.org" | host == "uswest.ensembl.org")){
+
if(biomart == "ensembl" & grepl(x = host, pattern = "ensembl.org")) {
biomart = "ENSEMBL_MART_ENSEMBL"
}
+
reqHost = host
+ host <- .cleanHostURL(host)
marts <- listMarts(host=host, path=path, port=port, includeHosts = TRUE,
archive = archive, ssl.verifypeer = ssl.verifypeer,
@@ -468,201 +472,146 @@ filterType <- function(filter, mart){
##########################################
getBM <- function(attributes, filters = "", values = "", mart, curl = NULL, checkFilters = TRUE, verbose=FALSE, uniqueRows=TRUE, bmHeader=FALSE, quote="\""){
-
- martCheck(mart)
- if(missing( attributes ))
- stop("Argument 'attributes' must be specified.")
-
- if(is.list(filters) && !missing( values ))
- warning("Argument 'values' should not be used when argument 'filters' is a list and will be ignored.")
- if(is.list(filters) && is.null(names(filters)))
- stop("Argument 'filters' must be a named list when sent as a list.")
- if(!is.list(filters) && filters != "" && missing( values ))
+
+ martCheck(mart)
+ if(missing( attributes ))
+ stop("Argument 'attributes' must be specified.")
+
+ if(is.list(filters) && !missing( values ))
+ warning("Argument 'values' should not be used when argument 'filters' is a list and will be ignored.")
+ if(is.list(filters) && is.null(names(filters)))
+ stop("Argument 'filters' must be a named list when sent as a list.")
+ if(!is.list(filters) && filters != "" && missing( values ))
stop("Argument 'values' must be specified.")
-
- if(length(filters) > 0 && length(values) == 0)
- stop("Values argument contains no data.")
-
- if(is.list(filters)){
- values = filters
- filters = names(filters)
- }
-
- if(class(uniqueRows) != "logical")
- stop("Argument 'uniqueRows' must be a logical value, so either TRUE or FALSE")
-
- ## force the query to return the 'english text' header names with the result
- ## we use these later to match and order attribute/column names
- callHeader <- TRUE
- xmlQuery = paste0("<?xml version='1.0' encoding='UTF-8'?><!DOCTYPE Query><Query virtualSchemaName = '",
- martVSchema(mart),
- "' uniqueRows = '",
- as.numeric(uniqueRows),
- "' count = '0' datasetConfigVersion = '0.6' header='",
- as.numeric(callHeader),
- "' requestid= 'biomaRt'> <Dataset name = '",
- martDataset(mart),"'>")
-
- #checking the Attributes
- invalid = !(attributes %in% listAttributes(mart, what="name"))
- if(any(invalid))
- stop(paste("Invalid attribute(s):", paste(attributes[invalid], collapse=", "),
- "\nPlease use the function 'listAttributes' to get valid attribute names"))
-
- #check if attributes come from multiple attribute pages currently disabled until ID issue resovled at Ensembl
- if(FALSE){
- att = listAttributes(mart, what=c("name","page"))
- att = att[which(att[,1] %in% attributes),]
- attOK = FALSE
- pages = unique(att[,2])
- if(length(pages) <= 1){
- attOK = TRUE
- }
- else{
- for(page in pages){
- if(length(attributes) == length(which(attributes %in% att[which(att[,2] == page),1]))) attOK = TRUE
- }
- }
- if(!attOK){
- stop(paste("Querying attributes from multiple attribute pages is not allowed. To see the attribute pages attributes belong to, use the function attributePages."))
+
+ if(length(filters) > 0 && length(values) == 0)
+ stop("Values argument contains no data.")
+
+ if(is.list(filters)){
+ values = filters
+ filters = names(filters)
}
- }
- #attribute are ok lets add them to the query
- attributeXML = paste("<Attribute name = '", attributes, "'/>", collapse="", sep="")
-
- #checking the filters
- if(filters[1] != "" && checkFilters){
- invalid = !(filters %in% listFilters(mart, what="name"))
+
+ if(class(uniqueRows) != "logical")
+ stop("Argument 'uniqueRows' must be a logical value, so either TRUE or FALSE")
+
+ ## force the query to return the 'english text' header names with the result
+ ## we use these later to match and order attribute/column names
+ callHeader <- TRUE
+ xmlQuery = paste0("<?xml version='1.0' encoding='UTF-8'?><!DOCTYPE Query><Query virtualSchemaName = '",
+ martVSchema(mart),
+ "' uniqueRows = '",
+ as.numeric(uniqueRows),
+ "' count = '0' datasetConfigVersion = '0.6' header='",
+ as.numeric(callHeader),
+ "' requestid= 'biomaRt'> <Dataset name = '",
+ martDataset(mart),"'>")
+
+ #checking the Attributes
+ invalid = !(attributes %in% listAttributes(mart, what="name"))
if(any(invalid))
- stop(paste("Invalid filters(s):", paste(filters[invalid], collapse=", "),
- "\nPlease use the function 'listFilters' to get valid filter names"))
- }
-
- filterXML = NULL
-
- if(length(filters) > 1){
- if(class(values)!= "list")stop("If using multiple filters, the 'value' has to be a list.\nFor example, a valid list for 'value' could be: list(affyid=c('1939_at','1000_at'), chromosome= '16')\nHere we select on Affymetrix identifier and chromosome, only results that pass both filters will be returned");
-
- for(i in seq(along = filters)){
- if(filters[i] %in% listFilters(mart, what = "name")){
- filtertype=filterType(filters[i], mart)
- if(filtertype == 'boolean' || filtertype == 'boolean_list'){
- if(!is.logical(values[[i]]))
- stop("biomaRt error: ", filters[i], " is a boolean filter and needs a corresponding logical value of TRUE or FALSE to indicate if the query should retrieve all data that fulfill the boolean or alternatively that all data that not fulfill the requirement should be retrieved.")
- if(!values[[i]]){
- values[[i]] = 1
- }
- else{
- values[[i]] = 0
- }
- filterXML = paste(filterXML,paste("<Filter name = '",filters[i],"' excluded = \"",values[[i]],"\" />", collapse="",sep=""),sep="")
+ stop(paste("Invalid attribute(s):", paste(attributes[invalid], collapse=", "),
+ "\nPlease use the function 'listAttributes' to get valid attribute names"))
+
+ #check if attributes come from multiple attribute pages currently disabled until ID issue resovled at Ensembl
+ if(FALSE){
+ att = listAttributes(mart, what=c("name","page"))
+ att = att[which(att[,1] %in% attributes),]
+ attOK = FALSE
+ pages = unique(att[,2])
+ if(length(pages) <= 1){
+ attOK = TRUE
}
else{
- if(is.numeric(values[[i]])){ values[[i]] = as.integer(values[[i]])}
- valuesString = paste(values[[i]],"",collapse=",",sep="")
- filterXML = paste(filterXML,paste("<Filter name = '",filters[i],"' value = '",valuesString,"' />", collapse="",sep=""),sep="")
+ for(page in pages){
+ if(length(attributes) == length(which(attributes %in% att[which(att[,2] == page),1]))) attOK = TRUE
+ }
}
- }
- else{ #used for attributes with values as these are treated as filters in BioMart
- valuesString = paste(values[[i]],"",collapse=",",sep="")
- filterXML = paste(filterXML,paste("<Filter name = '",filters[i],"' value = '",valuesString,"' />", collapse="",sep=""),sep="")
- }
- }
- }
- else{
- if(filters != ""){
- if(is.list(values)){
- values = unlist(values)
- }
- if(filters %in% listFilters(mart, what="name")){
- filtertype =filterType(filters, mart)
- if(filtertype == 'boolean' || filtertype == 'boolean_list'){
- if(!is.logical(values)) stop(paste("biomaRt error: ",filters," is a boolean filter and needs a corresponding logical value of TRUE or FALSE to indicate if the query should retrieve all data that fulfill the boolean or alternatively that all data that not fulfill the requirement should be retrieved."), sep="")
- if(!values){
- values = 1
- }
- else{
- values = 0
- }
- filterXML = paste("<Filter name = '",filters,"' excluded = \"",values,"\" />", collapse="",sep="")
+ if(!attOK){
+ stop(paste("Querying attributes from multiple attribute pages is not allowed. To see the attribute pages attributes belong to, use the function attributePages."))
}
- else{
- if(is.numeric(values)){
- values = as.integer(values)
- }
- valuesString = paste(values,"",collapse=",",sep="")
- filterXML = paste("<Filter name = '",filters,"' value = '",valuesString,"' />", collapse="",sep="")
- }
- }
- else{ #used for attributes with values as these are treated as filters in BioMart
- valuesString = paste(values,"",collapse=",",sep="")
- filterXML = paste(filterXML,paste("<Filter name = '",filters,"' value = '",valuesString,"' />", collapse="",sep=""),sep="")
- }
}
- else{
- filterXML=""
+ #attribute are ok lets add them to the query
+ attributeXML = paste("<Attribute name = '", attributes, "'/>", collapse="", sep="")
+
+ #checking the filters
+ if(filters[1] != "" && checkFilters){
+ invalid = !(filters %in% listFilters(mart, what="name"))
+ if(any(invalid))
+ stop(paste("Invalid filters(s):", paste(filters[invalid], collapse=", "),
+ "\nPlease use the function 'listFilters' to get valid filter names"))
}
- }
-
- xmlQuery = paste(xmlQuery, attributeXML, filterXML,"</Dataset></Query>",sep="")
- if(verbose){
- cat(paste(xmlQuery,"\n", sep=""))
- }
-
- ## we choose a separator based on whether 'redirect=no' is present
- sep <- ifelse(grepl(x = martHost(mart), pattern = ".+\\?.+"), "&", "?")
-
- postRes = tryCatch(postForm(paste0(martHost(mart), sep),"query" = xmlQuery), error = function(e){stop("Request to BioMart web service failed. Verify if you are still connected to the internet. Alternatively the BioMart web service is temporarily down.")})
- if(verbose){
- writeLines("#################\nResults from server:")
- print(postRes)
- }
- if(!(is.character(postRes) && (length(postRes)==1L)))
- stop("The query to the BioMart webservice returned an invalid result: biomaRt expected a character string of length 1. Please report this to the mailing list.")
-
- if(gsub("\n", "", postRes, fixed = TRUE, useBytes = TRUE) == "") { # meaning an empty result
+ ## filterXML is a list containing filters with reduced numbers of values
+ ## to meet the 500 value limit in BioMart queries
+ filterXmlList <- .generateFilterXML(filters, values, mart)
- result = as.data.frame(matrix("", ncol=length(attributes), nrow=0), stringsAsFactors=FALSE)
+ resultList <- list()
+ if(length(filterXmlList) > 1) {
+ pb <- progress_bar$new(total = length(filterXmlList),
+ width = options()$width - 10,
+ format = "Batch submitting query [:bar] :percent eta: :eta")
+ pb$tick(0)
+ }
- } else {
+ ## we submit a query for each chunk of the filter list
+ for(i in seq_along(filterXmlList)) {
+
+ if(exists('pb')) {
+ pb$tick()
+ }
+
+ filterXML <- filterXmlList[[ i ]]
+ fullXmlQuery = paste(xmlQuery, attributeXML, filterXML,"</Dataset></Query>",sep="")
+
+ if(verbose) {
+ message(fullXmlQuery)
+ }
+
+ ## we choose a separator based on whether '?redirect=no' is present
+ sep <- ifelse(grepl(x = martHost(mart), pattern = ".+\\?.+"), "&", "?")
+
+ postRes = tryCatch(postForm(paste0(martHost(mart), sep),"query" = fullXmlQuery),
+ error = function(e) {
+ stop("Request to BioMart web service failed. Verify if you are still connected to the internet. Alternatively the BioMart web service is temporarily down.")
+ }
+ )
+ if(verbose){
+ writeLines("#################\nResults from server:")
+ print(postRes)
+ }
+ if(!(is.character(postRes) && (length(postRes)==1L)))
+ stop("The query to the BioMart webservice returned an invalid result: biomaRt expected a character string of length 1. Please report this to the mailing list.")
+
+ if(gsub("\n", "", postRes, fixed = TRUE, useBytes = TRUE) == "") { # meaning an empty result
+
+ result = as.data.frame(matrix("", ncol=length(attributes), nrow=0), stringsAsFactors=FALSE)
+
+ } else {
+
+ if(length(grep("^Query ERROR", postRes))>0L)
+ stop(postRes)
+
+ ## convert the serialized table into a dataframe
+ con = textConnection(postRes)
+ result = read.table(con, sep="\t", header=callHeader, quote = quote, comment.char = "", check.names = FALSE, stringsAsFactors=FALSE)
+ if(verbose){
+ writeLines("#################\nParsed results:")
+ print(result)
+ }
+ close(con)
+
+ if(!(is(result, "data.frame") && (ncol(result)==length(attributes)))) {
+ print(head(result))
+ stop("The query to the BioMart webservice returned an invalid result: the number of columns in the result table does not equal the number of attributes in the query. Please report this to the mailing list.")
+ }
+ }
- if(length(grep("^Query ERROR", postRes))>0L)
- stop(postRes)
-
- ## convert the serialized table into a dataframe
- con = textConnection(postRes)
- result = read.table(con, sep="\t", header=callHeader, quote = quote, comment.char = "", check.names = FALSE, stringsAsFactors=FALSE)
- if(verbose){
- writeLines("#################\nParsed results:")
- print(result)
+ resultList[[i]] <- .setResultColNames(result, mart = mart, attributes = attributes, bmHeader = bmHeader)
}
- close(con)
-
- if(!(is(result, "data.frame") && (ncol(result)==length(attributes)))) {
- print(head(result))
- stop("The query to the BioMart webservice returned an invalid result: the number of columns in the result table does not equal the number of attributes in the query. Please report this to the mailing list.")
- }
- }
- # if(!bmHeader){ #assumes order of results same as order of attibutes in input
- # colnames(result) = attributes
- # }
- # else{
- # toAttributeName=FALSE
- # if(toAttributeName){ #set to TRUE if attempting to replace attribute descriptions with attribute names
- # att = listAttributes(mart)
- # resultNames = colnames(result)
- # for(r in 1:length(resultNames)){
- # asel = which(att[,2] == resultNames[r])
- # if(length(asel) == 1){
- # resultNames[r] = att[asel,1]
- # }
- # }
- # colnames(result) = resultNames
- # }
- # }
- result <- .setResultColNames(result, mart = mart, attributes = attributes, bmHeader = bmHeader)
- return(result)
+ ## collate results
+ result <- do.call('rbind', resultList)
+ return(result)
}
###################################
diff --git a/R/ensembl.R b/R/ensembl.R
new file mode 100644
index 0000000..8459455
--- /dev/null
+++ b/R/ensembl.R
@@ -0,0 +1,24 @@
+## location of Ensembl specific functions
+
+## Scrapes the Ensembl website for the list of current archives.
+##
+## Takes no arguments and needs network access to www.ensembl.org.
+## Returns a character matrix with one row per archive entry and the
+## columns "version", "date" and "url" (in that order). Stops with an
+## explicit error if the archive box cannot be found in the downloaded page.
+listEnsemblArchives <- function() {
+
+    html <- htmlParse("http://www.ensembl.org/info/website/archives/index.html")
+
+    archive_nodes <- getNodeSet(html, path = "//div[@class='plain-box float-right archive-box']")
+    ## indexing an empty node set with [[1]] fails with an opaque
+    ## "subscript out of bounds", so report the real problem instead
+    if(length(archive_nodes) == 0) {
+        stop("Unable to find the list of archives on the Ensembl website. ",
+             "The layout of the archive page may have changed.")
+    }
+
+    archive_box_string <- toString.XMLNode(archive_nodes[[1]])
+
+    ## one element per <li> entry; the text before the first <li> is dropped
+    archives <- strsplit(archive_box_string, split = "<li>")[[1]][-1]
+
+    ## for each entry pull out the "Ensembl NN" label, the archive URL and
+    ## the "Mon YYYY" date, in the order they occur in the entry
+    extracted <- str_extract_all(string = archives,
+                                 pattern = "Ensembl [A-Za-z0-9 ]{2,6}|http://.*ensembl\\.org|[A-Z][a-z]{2} [0-9]{4}")
+
+    tab <- do.call("rbind", extracted)
+    ## the column names assume each entry yields url, version, date in that order
+    colnames(tab) <- c("url", "version", "date")
+    ## drop = FALSE keeps a matrix even when only a single archive is listed
+    tab <- tab[, c(2,3,1), drop = FALSE]
+
+    return(tab)
+}
+
diff --git a/R/methods-Mart.R b/R/methods-Mart.R
index 8e6a2f8..9a17cc7 100644
--- a/R/methods-Mart.R
+++ b/R/methods-Mart.R
@@ -1,8 +1,20 @@
setMethod("show",signature(object="Mart"),
- function(object){
- res = paste("Object of class 'Mart':\n Using the ",object at biomart," BioMart database\n Using the ",object at dataset," dataset\n", sep="")
- cat(res)
-})
+ function(object){
+
+ dbase <- ifelse(nchar(object at biomart) != 0,
+ yes = paste(" Using the", object at biomart, "BioMart database"),
+ no = " No database selected.")
+
+ dset <- ifelse(nchar(object at dataset) != 0,
+ yes = paste(" Using the", object at dataset, "dataset"),
+ no = " No dataset selected.")
+
+ res <- paste("Object of class 'Mart':",
+ dbase,
+ dset,
+ sep="\n")
+ cat(res)
+ })
setGeneric("martBM",def=function(obj,...) standardGeneric("martBM"))
setMethod("martBM",signature("Mart"), function(obj) obj at biomart)
diff --git a/R/utilityFunctions.R b/R/utilityFunctions.R
index 982b2c7..bbe89e5 100644
--- a/R/utilityFunctions.R
+++ b/R/utilityFunctions.R
@@ -34,4 +34,117 @@
result <- result[, match(att[matches,1], attributes), drop=FALSE]
return(result)
-}
\ No newline at end of file
+}
+
+## BioMart doesn't work well if the list of values provided to a filter is
+## longer than 500 values. It returns only a subset of the requested data
+## and does so silently! This function takes a list of filter sets and splits
+## any filter holding more than 'maxChunkSize' values. It operates recursively
+## in case there are multiple filters that need splitting, and should ensure
+## all possible groupings of filters are retained.
+##
+## valuesList   : a list of filter sets; each set is a list holding one
+##                vector of values per filter.
+## maxChunkSize : the largest number of values allowed for any one filter.
+## Returns a list of filter sets in which no filter exceeds 'maxChunkSize'.
+.splitValues <- function(valuesList, maxChunkSize = 500) {
+
+    ## only the first set is inspected; the sets built below differ from one
+    ## another solely in filters that have already been split
+    vLength <- vapply(valuesList[[1]], FUN = length, FUN.VALUE = integer(1))
+
+    if(all(vLength <= maxChunkSize)) {
+        return(valuesList)
+    }
+
+    ## pick the next filter to split
+    vIdx <- min(which(vLength > maxChunkSize))
+    nValues <- vLength[[vIdx]]
+
+    ## ceiling() gives the fewest chunks that respect the limit and, unlike
+    ## '%/% + 1', can never leave a trailing chunk with no values in it
+    nchunks <- ceiling(nValues / maxChunkSize)
+    splitIdx <- rep(seq_len(nchunks), each = ceiling(nValues / nchunks))[ seq_len(nValues) ]
+
+    ## every existing set is repeated once per chunk of the filter being split
+    nSets <- length(valuesList)
+    tmpList <- vector("list", nchunks * nSets)
+    for(i in seq_len(nchunks)) {
+        for(j in seq_len(nSets)) {
+            listIdx <- ((i - 1) * nSets) + j
+            tmpList[[ listIdx ]] <- valuesList[[j]]
+            tmpList[[ listIdx ]][[ vIdx ]] <- tmpList[[ listIdx ]][[ vIdx ]][which(splitIdx == i)]
+        }
+    }
+
+    ## recursively process the next filter, keeping the caller's chunk size
+    .splitValues(tmpList, maxChunkSize = maxChunkSize)
+}
+
+## Builds the filter XML for a single chunk of values.
+##
+## filterChunk : a named list with one entry per filter; the names are the
+##               filter names and the entries are the values to submit.
+## mart        : the Mart object passed on to listFilters() and filterType().
+## Returns one string holding the <Filter .../> elements for every filter
+## in the chunk pasted together.
+.createFilterXMLchunk <- function(filterChunk, mart) {
+
+    ## renders the <Filter .../> element for one named filter
+    buildFilterTag <- function(filterName) {
+
+        filterValue <- filterChunk[[filterName]]
+
+        ## a filter unknown to the mart is treated like a non-boolean one
+        isBoolean <- filterName %in% listFilters(mart, what = "name") &&
+            grepl('boolean', filterType(filter = filterName, mart = mart))
+
+        if(isBoolean) {
+            if(!is.logical(filterValue))
+                stop("biomaRt error:\n",
+                     filterName, " is a boolean filter and needs a corresponding logical value of TRUE or FALSE to indicate if the query should retrieve all data that fulfill the boolean or alternatively that all data that not fulfill the requirement should be retrieved.")
+            ## TRUE is sent as excluded = "0" and FALSE as excluded = "1"
+            excluded <- ifelse(filterValue, yes = 0, no = 1)
+            attribute <- paste0("' excluded = \"", excluded, "\" ")
+        } else {
+            ## numeric values are converted to integer before being pasted
+            if(is.numeric(filterValue))
+                filterValue <- as.integer(filterValue)
+            joined <- paste0(filterValue, collapse = ",")
+            attribute <- paste0("' value = '", joined, "' ")
+        }
+
+        paste0("<Filter name = '", filterName, attribute, "/>")
+    }
+
+    tags <- vapply(names(filterChunk),
+                   FUN = buildFilterTag,
+                   FUN.VALUE = character(1),
+                   USE.NAMES = FALSE)
+
+    paste0(tags, collapse = "")
+}
+
+## Turns the filters and values given by the user into a list of filter XML
+## strings, one for each batch of values produced by .splitValues().
+##
+## filters : character vector of filter names; "" means no filtering.
+## values  : a vector (single filter), or a list with one entry per filter.
+##           A data frame is accepted for a single filter, in which case its
+##           first column is used.
+## mart    : the Mart object, passed on to .createFilterXMLchunk().
+## Returns "" when no filter is given, otherwise a list of XML strings.
+.generateFilterXML <- function(filters = "", values, mart) {
+
+    ## return empty string if no filter specified
+    if(filters[1] == "") {
+        return("")
+    }
+    ## if we have multiple filters, the values must be specified as a list.
+    ## inherits() is used because class() can return more than one string
+    if(length(filters) > 1 && !inherits(values, "list")) {
+        stop("If using multiple filters, the 'value' has to be a list.\nFor example, a valid list for 'value' could be: list(affyid=c('1939_at','1000_at'), chromosome= '16')\nHere we select on Affymetrix identifier and chromosome, only results that pass both filters will be returned");
+    }
+    ## it's easy to not realise you're passing a data frame here, so check.
+    ## The first column has always been what gets used; say so when other
+    ## columns are being ignored rather than dropping them silently.
+    if(is.data.frame(values) && ncol(values) >= 1) {
+        if(ncol(values) > 1) {
+            warning("'values' is a data frame with more than one column; only the first column will be used.")
+        }
+        values <- values[[1]]
+    }
+
+    if(!is.list(values)){
+        values <- list(values)
+    }
+    names(values) <- filters
+
+    ## break any over-long filter into batches
+    values <- .splitValues(list(values))
+
+    lapply(values, .createFilterXMLchunk, mart)
+}
+
+## Users often copy and paste the host name from a browser when they are not
+## accessing Ensembl. That typically brings along the "http://" and maybe a
+## trailing "/", which breaks our strategy of pasting the complete URL
+## together and produces something invalid.
+## This function tidies the host up to catch those common variants: one
+## trailing "/" is removed and "http://" is added unless the host already
+## starts with "http://" or "https://".
+.cleanHostURL <- function(host) {
+
+    ## the pattern is anchored to the end, so at most one slash is removed
+    trimmed <- sub(pattern = "/$", replacement = "", x = host)
+
+    ## a host that already carries a scheme is returned as it stands
+    hasScheme <- grepl(pattern = "^http://|^https://", x = trimmed)
+    if(hasScheme) {
+        return(trimmed)
+    }
+
+    paste0("http://", trimmed)
+}
diff --git a/build/vignette.rds b/build/vignette.rds
index daeeff4..fef17e9 100644
Binary files a/build/vignette.rds and b/build/vignette.rds differ
diff --git a/inst/doc/biomaRt.R b/inst/doc/biomaRt.R
index 658a278..aa42009 100644
--- a/inst/doc/biomaRt.R
+++ b/inst/doc/biomaRt.R
@@ -1,4 +1,4 @@
-## ----setup, cache = F, echo = FALSE--------------------------------------
+## ----setup, cache = F, echo = FALSE----------------------------------------
knitr::opts_chunk$set(error = TRUE)
## ----annotate,echo=FALSE----------------------------------------------------------------------------------------------
@@ -129,11 +129,14 @@ listMarts(archive = TRUE)
## ----archiveMarts2, echo = TRUE, eval = TRUE--------------------------------------------------------------------------
ensembl = useMart("ensembl_mart_46", dataset="hsapiens_gene_ensembl", archive = TRUE)
+## ----archiveMarts, echo = TRUE, eval = TRUE---------------------------------------------------------------------------
+listEnsemblArchives()
+
## ----archiveMarts3, echo = TRUE, eval = TRUE--------------------------------------------------------------------------
-listMarts(host='may2009.archive.ensembl.org')
-ensembl54=useMart(host='may2009.archive.ensembl.org',
- biomart='ENSEMBL_MART_ENSEMBL',
- dataset='hsapiens_gene_ensembl')
+listMarts(host = 'may2009.archive.ensembl.org')
+ensembl54 <- useMart(host='may2009.archive.ensembl.org',
+ biomart='ENSEMBL_MART_ENSEMBL',
+ dataset='hsapiens_gene_ensembl')
## ----wormbase, echo=TRUE, eval=TRUE-----------------------------------------------------------------------------------
listMarts(host = "parasite.wormbase.org")
diff --git a/inst/doc/biomaRt.Rmd b/inst/doc/biomaRt.Rmd
index ed3b60f..c404e64 100644
--- a/inst/doc/biomaRt.Rmd
+++ b/inst/doc/biomaRt.Rmd
@@ -326,14 +326,24 @@ After you selected the BioMart database and dataset, queries can be performed in
## Accessing archives through specifying the archive host
-->
-Use the <http://www.ensembl.org> website and go down the bottom of the page. Click on 'view in Archive' and select the archive you need. Copy the url and use that url as shown below to connect to the specified BioMart database. The example below shows how to query Ensembl 54.
+`r Biocpkg("biomaRt")` provides the function `listEnsemblArchives()` to view the available archives. This function takes no arguments, and produces a table containing the names of the available archived versions, the date they were first available, and the URL where they can be accessed.
+
+```{r archiveMarts, echo = TRUE, eval = TRUE}
+listEnsemblArchives()
+```
+
+Alternatively, one can use the <http://www.ensembl.org> website to find archived versions. From the main page, scroll down to the bottom of the page, click on 'view in Archive' and select the archive you need.
+
+*You will notice that there is an archive URL even for the current release of Ensembl. It can be useful to use this if you wish to ensure that a script you write now will return exactly the same results in the future. Using `www.ensembl.org` will always access the current release, and so the data retrieved may change over time as new releases come out.*
+
+Whichever method you use to find the URL of the archive you wish to query, copy the url and use that in the `host` argument as shown below to connect to the specified BioMart database. The example below shows how to query Ensembl 54.
```{r archiveMarts3, echo = TRUE, eval = TRUE}
-listMarts(host='may2009.archive.ensembl.org')
-ensembl54=useMart(host='may2009.archive.ensembl.org',
- biomart='ENSEMBL_MART_ENSEMBL',
- dataset='hsapiens_gene_ensembl')
+listMarts(host = 'may2009.archive.ensembl.org')
+ensembl54 <- useMart(host='may2009.archive.ensembl.org',
+ biomart='ENSEMBL_MART_ENSEMBL',
+ dataset='hsapiens_gene_ensembl')
```
diff --git a/inst/doc/biomaRt.html b/inst/doc/biomaRt.html
index e03348c..96d2cd4 100644
--- a/inst/doc/biomaRt.html
+++ b/inst/doc/biomaRt.html
@@ -4,23 +4,30 @@
<head>
-<meta charset="utf-8">
+<meta charset="utf-8" />
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<meta name="generator" content="pandoc" />
+
<meta name="author" content="Steffen Durinck, Wolfgang Huber, Mike Smith" />
+<meta name="date" content="2017-10-30" />
<title>The biomaRt users guide</title>
+<script src="data:application/x-javascript;base64,LyohIGpRdWVyeSB2MS4xMS4zIHwgKGMpIDIwMDUsIDIwMTUgalF1ZXJ5IEZvdW5kYXRpb24sIEluYy4gfCBqcXVlcnkub3JnL2xpY2Vuc2UgKi8KIWZ1bmN0aW9uKGEsYil7Im9iamVjdCI9PXR5cGVvZiBtb2R1bGUmJiJvYmplY3QiPT10eXBlb2YgbW9kdWxlLmV4cG9ydHM/bW9kdWxlLmV4cG9ydHM9YS5kb2N1bWVudD9iKGEsITApOmZ1bmN0aW9uKGEpe2lmKCFhLmRvY3VtZW50KXRocm93IG5ldyBFcnJvcigialF1ZXJ5IHJlcXVpcmVzIGEgd2luZG93IHdpdGggYSBkb2N1bWVudCIpO3JldHVybiBiKGEpfTpiKGEpfSgidW5kZWZpbmVkIiE9dHlwZW9mIHdpbmRvdz93aW5kb3c6dG [...]
+<meta name="viewport" content="width=device-width, initial-scale=1" />
+<link href="data:text/css;charset=utf-8,html%7Bfont%2Dfamily%3Asans%2Dserif%3B%2Dwebkit%2Dtext%2Dsize%2Dadjust%3A100%25%3B%2Dms%2Dtext%2Dsize%2Dadjust%3A100%25%7Dbody%7Bmargin%3A0%7Darticle%2Caside%2Cdetails%2Cfigcaption%2Cfigure%2Cfooter%2Cheader%2Chgroup%2Cmain%2Cmenu%2Cnav%2Csection%2Csummary%7Bdisplay%3Ablock%7Daudio%2Ccanvas%2Cprogress%2Cvideo%7Bdisplay%3Ainline%2Dblock%3Bvertical%2Dalign%3Abaseline%7Daudio%3Anot%28%5Bcontrols%5D%29%7Bdisplay%3Anone%3Bheight%3A0%7D%5Bhidden%5D%2Ctem [...]
+<script src="data:application/x-javascript;base64,LyohCiAqIEJvb3RzdHJhcCB2My4zLjUgKGh0dHA6Ly9nZXRib290c3RyYXAuY29tKQogKiBDb3B5cmlnaHQgMjAxMS0yMDE1IFR3aXR0ZXIsIEluYy4KICogTGljZW5zZWQgdW5kZXIgdGhlIE1JVCBsaWNlbnNlCiAqLwppZigidW5kZWZpbmVkIj09dHlwZW9mIGpRdWVyeSl0aHJvdyBuZXcgRXJyb3IoIkJvb3RzdHJhcCdzIEphdmFTY3JpcHQgcmVxdWlyZXMgalF1ZXJ5Iik7K2Z1bmN0aW9uKGEpeyJ1c2Ugc3RyaWN0Ijt2YXIgYj1hLmZuLmpxdWVyeS5zcGxpdCgiICIpWzBdLnNwbGl0KCIuIik7aWYoYlswXTwyJiZiWzFdPDl8fDE9PWJbMF0mJjk9PWJbMV0mJmJbMl08MSl0aHJvdy [...]
+<script src="data:application/x-javascript;base64,LyoqCiogQHByZXNlcnZlIEhUTUw1IFNoaXYgMy43LjIgfCBAYWZhcmthcyBAamRhbHRvbiBAam9uX25lYWwgQHJlbSB8IE1JVC9HUEwyIExpY2Vuc2VkCiovCi8vIE9ubHkgcnVuIHRoaXMgY29kZSBpbiBJRSA4CmlmICghIXdpbmRvdy5uYXZpZ2F0b3IudXNlckFnZW50Lm1hdGNoKCJNU0lFIDgiKSkgewohZnVuY3Rpb24oYSxiKXtmdW5jdGlvbiBjKGEsYil7dmFyIGM9YS5jcmVhdGVFbGVtZW50KCJwIiksZD1hLmdldEVsZW1lbnRzQnlUYWdOYW1lKCJoZWFkIilbMF18fGEuZG9jdW1lbnRFbGVtZW50O3JldHVybiBjLmlubmVySFRNTD0ieDxzdHlsZT4iK2IrIjwvc3R5bGU+IixkLm [...]
+<script src="data:application/x-javascript;base64,LyohIFJlc3BvbmQuanMgdjEuNC4yOiBtaW4vbWF4LXdpZHRoIG1lZGlhIHF1ZXJ5IHBvbHlmaWxsICogQ29weXJpZ2h0IDIwMTMgU2NvdHQgSmVobAogKiBMaWNlbnNlZCB1bmRlciBodHRwczovL2dpdGh1Yi5jb20vc2NvdHRqZWhsL1Jlc3BvbmQvYmxvYi9tYXN0ZXIvTElDRU5TRS1NSVQKICogICovCgovLyBPbmx5IHJ1biB0aGlzIGNvZGUgaW4gSUUgOAppZiAoISF3aW5kb3cubmF2aWdhdG9yLnVzZXJBZ2VudC5tYXRjaCgiTVNJRSA4IikpIHsKIWZ1bmN0aW9uKGEpeyJ1c2Ugc3RyaWN0IjthLm1hdGNoTWVkaWE9YS5tYXRjaE1lZGlhfHxmdW5jdGlvbihhKXt2YXIgYixjPWEuZG [...]
+<script src="data:application/x-javascript;base64,CgovKioKICogalF1ZXJ5IFBsdWdpbjogU3RpY2t5IFRhYnMKICoKICogQGF1dGhvciBBaWRhbiBMaXN0ZXIgPGFpZGFuQHBocC5uZXQ+CiAqIGFkYXB0ZWQgYnkgUnViZW4gQXJzbGFuIHRvIGFjdGl2YXRlIHBhcmVudCB0YWJzIHRvbwogKiBodHRwOi8vd3d3LmFpZGFubGlzdGVyLmNvbS8yMDE0LzAzL3BlcnNpc3RpbmctdGhlLXRhYi1zdGF0ZS1pbi1ib290c3RyYXAvCiAqLwooZnVuY3Rpb24oJCkgewogICJ1c2Ugc3RyaWN0IjsKICAkLmZuLnJtYXJrZG93blN0aWNreVRhYnMgPSBmdW5jdGlvbigpIHsKICAgIHZhciBjb250ZXh0ID0gdGhpczsKICAgIC8vIFNob3cgdGhlIHRhYi [...]
<link href="data:text/css;charset=utf-8,pre%20%2Eoperator%2C%0Apre%20%2Eparen%20%7B%0Acolor%3A%20rgb%28104%2C%20118%2C%20135%29%0A%7D%0Apre%20%2Eliteral%20%7B%0Acolor%3A%20%23990073%0A%7D%0Apre%20%2Enumber%20%7B%0Acolor%3A%20%23099%3B%0A%7D%0Apre%20%2Ecomment%20%7B%0Acolor%3A%20%23998%3B%0Afont%2Dstyle%3A%20italic%0A%7D%0Apre%20%2Ekeyword%20%7B%0Acolor%3A%20%23900%3B%0Afont%2Dweight%3A%20bold%0A%7D%0Apre%20%2Eidentifier%20%7B%0Acolor%3A%20rgb%280%2C%200%2C%200%29%3B%0A%7D%0Apre%20%2Estri [...]
<script src="data:application/x-javascript;base64,dmFyIGhsanM9bmV3IGZ1bmN0aW9uKCl7ZnVuY3Rpb24gbShwKXtyZXR1cm4gcC5yZXBsYWNlKC8mL2dtLCImYW1wOyIpLnJlcGxhY2UoLzwvZ20sIiZsdDsiKX1mdW5jdGlvbiBmKHIscSxwKXtyZXR1cm4gUmVnRXhwKHEsIm0iKyhyLmNJPyJpIjoiIikrKHA/ImciOiIiKSl9ZnVuY3Rpb24gYihyKXtmb3IodmFyIHA9MDtwPHIuY2hpbGROb2Rlcy5sZW5ndGg7cCsrKXt2YXIgcT1yLmNoaWxkTm9kZXNbcF07aWYocS5ub2RlTmFtZT09IkNPREUiKXtyZXR1cm4gcX1pZighKHEubm9kZVR5cGU9PTMmJnEubm9kZVZhbHVlLm1hdGNoKC9ccysvKSkpe2JyZWFrfX19ZnVuY3Rpb24gaCh0LH [...]
<style type="text/css">code{white-space: pre;}</style>
<style type="text/css">
- pre:not([class]) {
- background-color: white;
- }
+
</style>
<script type="text/javascript">
if (window.hljs && document.readyState && document.readyState === "complete") {
@@ -31,27 +38,88 @@ if (window.hljs && document.readyState && document.readyState === "complete") {
</script>
-<link href="data:text/css;charset=utf-8,body%2C%20td%20%7B%0Afont%2Dfamily%3A%20sans%2Dserif%3B%0Abackground%2Dcolor%3A%20white%3B%0Afont%2Dsize%3A%2013px%3B%0A%7D%0Abody%20%7B%0Amax%2Dwidth%3A%20800px%3B%0Amargin%3A%200%20auto%3B%0Apadding%3A%201em%201em%202em%3B%0Aline%2Dheight%3A%2020px%3B%0A%7D%0A%0Adiv%23TOC%20li%20%7B%0Alist%2Dstyle%3Anone%3B%0Abackground%2Dimage%3Anone%3B%0Abackground%2Drepeat%3Anone%3B%0Abackground%2Dposition%3A0%3B%0A%7D%0A%0Ap%2C%20pre%20%7B%20margin%3A%200em%2 [...]
-<script type="text/javascript">
-document.addEventListener("DOMContentLoaded", function() {
- var links = document.links;
- for (var i = 0, linksLength = links.length; i < linksLength; i++)
- if(links[i].hostname != window.location.hostname)
- links[i].target = '_blank';
-});
-</script>
+<style type="text/css">
+h1 {
+ font-size: 34px;
+}
+h1.title {
+ font-size: 38px;
+}
+h2 {
+ font-size: 30px;
+}
+h3 {
+ font-size: 24px;
+}
+h4 {
+ font-size: 18px;
+}
+h5 {
+ font-size: 16px;
+}
+h6 {
+ font-size: 12px;
+}
+.table th:not([align]) {
+ text-align: left;
+}
+</style>
+
+<link href="data:text/css;charset=utf-8,body%20%7B%0Amargin%3A%200px%20auto%3B%0Amax%2Dwidth%3A%201134px%3B%0Afont%2Dfamily%3A%20sans%2Dserif%3B%0Afont%2Dsize%3A%2010pt%3B%0A%7D%0A%0Adiv%23TOC%20ul%20%7B%0Apadding%3A%200px%200px%200px%2045px%3B%0Alist%2Dstyle%3A%20none%3B%0Abackground%2Dimage%3A%20none%3B%0Abackground%2Drepeat%3A%20none%3B%0Abackground%2Dposition%3A%200%3B%0Afont%2Dsize%3A%2010pt%3B%0Afont%2Dfamily%3A%20Helvetica%2C%20Arial%2C%20sans%2Dserif%3B%0A%7D%0Adiv%23TOC%20%3E%20 [...]
</head>
<body>
+<style type="text/css">
+.main-container {
+ max-width: 828px;
+ margin-left: auto;
+ margin-right: auto;
+}
+
+img {
+ max-width:100%;
+ height: auto;
+}
+.tabbed-pane {
+ padding-top: 12px;
+}
+button.code-folding-btn:focus {
+ outline: none;
+}
+</style>
+
+
+
+<div class="container-fluid main-container">
+
+<!-- tabsets -->
+<script>
+$(document).ready(function () {
+ window.buildTabsets("TOC");
+});
+</script>
+
+<!-- code folding -->
+
+
+
+
+
+
+<div class="fluid-row" id="header">
+
+
+
+<h1 class="title toc-ignore">The biomaRt users guide</h1>
+<p class="author-name">Steffen Durinck, Wolfgang Huber, Mike Smith</p>
+<h4 class="date"><em>30 October 2017</em></h4>
+<h4 class="package">Package</h4>
+<p>biomaRt 2.34.0</p>
-<div id="header">
-<h1 class="title">The biomaRt users guide</h1>
-<h4 class="author"><em>Steffen Durinck, Wolfgang Huber, Mike Smith</em></h4>
</div>
-<h4 class="package">Package version: <span style="font-weight: normal">biomaRt 2.32.1</span></h4>
<h1>Contents</h1>
<div id="TOC">
@@ -98,10 +166,10 @@ document.addEventListener("DOMContentLoaded", function() {
<pre class="r"><code>library("biomaRt")
listMarts()</code></pre>
<pre><code>## biomart version
-## 1 ENSEMBL_MART_ENSEMBL Ensembl Genes 89
-## 2 ENSEMBL_MART_MOUSE Mouse strains 89
-## 3 ENSEMBL_MART_SNP Ensembl Variation 89
-## 4 ENSEMBL_MART_FUNCGEN Ensembl Regulation 89</code></pre>
+## 1 ENSEMBL_MART_ENSEMBL Ensembl Genes 90
+## 2 ENSEMBL_MART_MOUSE Mouse strains 90
+## 3 ENSEMBL_MART_SNP Ensembl Variation 90
+## 4 ENSEMBL_MART_FUNCGEN Ensembl Regulation 90</code></pre>
<p>Note: if the function <code>useMart()</code> runs into proxy problems you should set your proxy first before calling any <em><a href="http://bioconductor.org/packages/biomaRt">biomaRt</a></em> functions.<br />
You can do this using the Sys.setenv command:</p>
<pre class="r"><code>Sys.setenv("http_proxy" = "http://my.proxy.org:9999")</code></pre>
@@ -111,76 +179,92 @@ You can do this using the Sys.putenv command:</p>
<pre class="r"><code>ensembl=useMart("ensembl")</code></pre>
<p>BioMart databases can contain several datasets, for Ensembl every species is a different dataset. In a next step we look at which datasets are available in the selected BioMart by using the function <code>listDatasets()</code>.</p>
<pre class="r"><code>listDatasets(ensembl)</code></pre>
-<pre><code>## dataset description version
-## 1 loculatus_gene_ensembl Spotted gar genes (LepOcu1) LepOcu1
-## 2 lafricana_gene_ensembl Elephant genes (Loxafr3.0) Loxafr3.0
-## 3 ocuniculus_gene_ensembl Rabbit genes (OryCun2.0) OryCun2.0
-## 4 acarolinensis_gene_ensembl Anole lizard genes (AnoCar2.0) AnoCar2.0
-## 5 aplatyrhynchos_gene_ensembl Duck genes (BGI_duck_1.0) BGI_duck_1.0
-## 6 mdomestica_gene_ensembl Opossum genes (monDom5) monDom5
-## 7 sharrisii_gene_ensembl Tasmanian devil genes (Devil_ref v7.0) Devil_ref v7.0
-## 8 oaries_gene_ensembl Sheep genes (Oar_v3.1) Oar_v3.1
-## 9 ggorilla_gene_ensembl Gorilla genes (gorGor3.1) gorGor3.1
-## 10 btaurus_gene_ensembl Cow genes (UMD3.1) UMD3.1
-## 11 ecaballus_gene_ensembl Horse genes (Equ Cab 2) Equ Cab 2
-## 12 gaculeatus_gene_ensembl Stickleback genes (BROAD S1) BROAD S1
-## 13 tbelangeri_gene_ensembl Tree Shrew genes (tupBel1) tupBel1
-## 14 choffmanni_gene_ensembl Sloth genes (choHof1) choHof1
-## 15 cporcellus_gene_ensembl Guinea Pig genes (cavPor3) cavPor3
-## 16 tnigroviridis_gene_ensembl Tetraodon genes (TETRAODON 8.0) TETRAODON 8.0
-## 17 ogarnettii_gene_ensembl Bushbaby genes (OtoGar3) OtoGar3
-## 18 csabaeus_gene_ensembl Vervet-AGM genes (ChlSab1.1) ChlSab1.1
-## 19 pabelii_gene_ensembl Orangutan genes (PPYG2) PPYG2
-## 20 etelfairi_gene_ensembl Lesser hedgehog tenrec genes (TENREC) TENREC
-## 21 sscrofa_gene_ensembl Pig genes (Sscrofa10.2) Sscrofa10.2
-## 22 olatipes_gene_ensembl Medaka genes (HdrR) HdrR
-## 23 pformosa_gene_ensembl Amazon molly genes (Poecilia_formosa-5.1.2) Poecilia_formosa-5.1.2
-## 24 mfuro_gene_ensembl Ferret genes (MusPutFur1.0) MusPutFur1.0
-## 25 dnovemcinctus_gene_ensembl Armadillo genes (Dasnov3.0) Dasnov3.0
-## 26 pmarinus_gene_ensembl Lamprey genes (Pmarinus_7.0) Pmarinus_7.0
-## 27 eeuropaeus_gene_ensembl Hedgehog genes (eriEur1) eriEur1
-## 28 mgallopavo_gene_ensembl Turkey genes (Turkey_2.01) Turkey_2.01
-## 29 tguttata_gene_ensembl Zebra Finch genes (taeGut3.2.4) taeGut3.2.4
-## 30 gmorhua_gene_ensembl Cod genes (gadMor1) gadMor1
-## 31 itridecemlineatus_gene_ensembl Squirrel genes (spetri2) spetri2
-## 32 pcapensis_gene_ensembl Hyrax genes (proCap1) proCap1
-## 33 nleucogenys_gene_ensembl Gibbon genes (Nleu1.0) Nleu1.0
-## 34 pvampyrus_gene_ensembl Megabat genes (pteVam1) pteVam1
-## 35 vpacos_gene_ensembl Alpaca genes (vicPac1) vicPac1
-## 36 oprinceps_gene_ensembl Pika genes (OchPri2.0-Ens) OchPri2.0-Ens
-## 37 mlucifugus_gene_ensembl Microbat genes (Myoluc2.0) Myoluc2.0
-## 38 ggallus_gene_ensembl Chicken genes (Gallus_gallus-5.0) Gallus_gallus-5.0
-## 39 dordii_gene_ensembl Kangaroo rat genes (dipOrd1) dipOrd1
-## 40 ptroglodytes_gene_ensembl Chimpanzee genes (CHIMP2.1.4) CHIMP2.1.4
-## 41 lchalumnae_gene_ensembl Coelacanth genes (LatCha1) LatCha1
-## 42 saraneus_gene_ensembl Shrew genes (sorAra1) sorAra1
-## 43 amelanoleuca_gene_ensembl Panda genes (ailMel1) ailMel1
-## 44 oniloticus_gene_ensembl Tilapia genes (Orenil1.0) Orenil1.0
-## 45 trubripes_gene_ensembl Fugu genes (FUGU 4.0) FUGU 4.0
-## 46 cfamiliaris_gene_ensembl Dog genes (CanFam3.1) CanFam3.1
-## 47 mmulatta_gene_ensembl Macaque genes (Mmul_8.0.1) Mmul_8.0.1
-## 48 panubis_gene_ensembl Olive baboon genes (PapAnu2.0) PapAnu2.0
-## 49 fcatus_gene_ensembl Cat genes (Felis_catus_6.2) Felis_catus_6.2
-## 50 neugenii_gene_ensembl Wallaby genes (Meug_1.0) Meug_1.0
-## 51 csavignyi_gene_ensembl C.savignyi genes (CSAV 2.0) CSAV 2.0
-## 52 dmelanogaster_gene_ensembl Fruitfly genes (BDGP6) BDGP6
-## 53 cintestinalis_gene_ensembl C.intestinalis genes (KH) KH
-## 54 xmaculatus_gene_ensembl Platyfish genes (Xipmac4.4.2) Xipmac4.4.2
-## 55 mmurinus_gene_ensembl Mouse Lemur genes (Mmur_2.0) Mmur_2.0
-## 56 hsapiens_gene_ensembl Human genes (GRCh38.p10) GRCh38.p10
-## 57 csyrichta_gene_ensembl Tarsier genes (tarSyr1) tarSyr1
-## 58 celegans_gene_ensembl Caenorhabditis elegans genes (WBcel235) WBcel235
-## 59 psinensis_gene_ensembl Chinese softshell turtle genes (PelSin_1.0) PelSin_1.0
-## 60 rnorvegicus_gene_ensembl Rat genes (Rnor_6.0) Rnor_6.0
-## 61 cjacchus_gene_ensembl Marmoset genes (C_jacchus3.2.1) C_jacchus3.2.1
-## 62 oanatinus_gene_ensembl Platypus genes (OANA5) OANA5
-## 63 ttruncatus_gene_ensembl Dolphin genes (turTru1) turTru1
-## 64 amexicanus_gene_ensembl Cave fish genes (AstMex102) AstMex102
-## 65 scerevisiae_gene_ensembl Saccharomyces cerevisiae genes (R64-1-1) R64-1-1
-## 66 drerio_gene_ensembl Zebrafish genes (GRCz10) GRCz10
-## 67 xtropicalis_gene_ensembl Xenopus genes (JGI 4.2) JGI 4.2
-## 68 mmusculus_gene_ensembl Mouse genes (GRCm38.p5) GRCm38.p5
-## 69 falbicollis_gene_ensembl Flycatcher genes (FicAlb_1.4) FicAlb_1.4</code></pre>
+<pre><code>## dataset description version
+## 1 ngalili_gene_ensembl Upper Galilee mountains blind mole rat genes (S.galili_v1.0) S.galili_v1.0
+## 2 oprinceps_gene_ensembl Pika genes (OchPri2.0-Ens) OchPri2.0-Ens
+## 3 hfemale_gene_ensembl Naked mole-rat female genes (HetGla_female_1.0) HetGla_female_1.0
+## 4 pbairdii_gene_ensembl Northern American deer mouse genes (Pman_1.0) Pman_1.0
+## 5 mmurinus_gene_ensembl Mouse Lemur genes (Mmur_2.0) Mmur_2.0
+## 6 mfuro_gene_ensembl Ferret genes (MusPutFur1.0) MusPutFur1.0
+## 7 trubripes_gene_ensembl Fugu genes (FUGU 4.0) FUGU 4.0
+## 8 cporcellus_gene_ensembl Guinea Pig genes (Cavpor3.0) Cavpor3.0
+## 9 saraneus_gene_ensembl Shrew genes (sorAra1) sorAra1
+## 10 fdamarensis_gene_ensembl Damara mole rat genes (DMR_v1.0) DMR_v1.0
+## 11 gmorhua_gene_ensembl Cod genes (gadMor1) gadMor1
+## 12 mochrogaster_gene_ensembl Prairie vole genes (MicOch1.0) MicOch1.0
+## 13 cjacchus_gene_ensembl Marmoset genes (C_jacchus3.2.1) C_jacchus3.2.1
+## 14 lafricana_gene_ensembl Elephant genes (Loxafr3.0) Loxafr3.0
+## 15 btaurus_gene_ensembl Cow genes (UMD3.1) UMD3.1
+## 16 mcaroli_gene_ensembl Ryukyu mouse genes (CAROLI_EIJ_v1.1) CAROLI_EIJ_v1.1
+## 17 pmarinus_gene_ensembl Lamprey genes (Pmarinus_7.0) Pmarinus_7.0
+## 18 ogarnettii_gene_ensembl Bushbaby genes (OtoGar3) OtoGar3
+## 19 csyrichta_gene_ensembl Tarsier genes (tarSyr1) tarSyr1
+## 20 gaculeatus_gene_ensembl Stickleback genes (BROAD S1) BROAD S1
+## 21 etelfairi_gene_ensembl Lesser hedgehog tenrec genes (TENREC) TENREC
+## 22 ttruncatus_gene_ensembl Dolphin genes (turTru1) turTru1
+## 23 ecaballus_gene_ensembl Horse genes (Equ Cab 2) Equ Cab 2
+## 24 mmusculus_gene_ensembl Mouse genes (GRCm38.p5) GRCm38.p5
+## 25 pabelii_gene_ensembl Orangutan genes (PPYG2) PPYG2
+## 26 drerio_gene_ensembl Zebrafish genes (GRCz10) GRCz10
+## 27 oniloticus_gene_ensembl Tilapia genes (Orenil1.0) Orenil1.0
+## 28 mdomestica_gene_ensembl Opossum genes (monDom5) monDom5
+## 29 cintestinalis_gene_ensembl C.intestinalis genes (KH) KH
+## 30 panubis_gene_ensembl Olive baboon genes (PapAnu2.0) PapAnu2.0
+## 31 mgallopavo_gene_ensembl Turkey genes (Turkey_2.01) Turkey_2.01
+## 32 olatipes_gene_ensembl Medaka genes (HdrR) HdrR
+## 33 oanatinus_gene_ensembl Platypus genes (OANA5) OANA5
+## 34 ocuniculus_gene_ensembl Rabbit genes (OryCun2.0) OryCun2.0
+## 35 jjaculus_gene_ensembl Lesser Egyptian jerboa genes (JacJac1.0) JacJac1.0
+## 36 rnorvegicus_gene_ensembl Rat genes (Rnor_6.0) Rnor_6.0
+## 37 amelanoleuca_gene_ensembl Panda genes (ailMel1) ailMel1
+## 38 csavignyi_gene_ensembl C.savignyi genes (CSAV 2.0) CSAV 2.0
+## 39 mauratus_gene_ensembl Golden Hamster genes (MesAur1.0) MesAur1.0
+## 40 hmale_gene_ensembl Naked mole-rat male genes (HetGla_1.0) HetGla_1.0
+## 41 oaries_gene_ensembl Sheep genes (Oar_v3.1) Oar_v3.1
+## 42 tnigroviridis_gene_ensembl Tetraodon genes (TETRAODON 8.0) TETRAODON 8.0
+## 43 cchok1gshd_gene_ensembl Chinese hamster CHOK1GS genes (CHOK1GS_HDv1) CHOK1GS_HDv1
+## 44 itridecemlineatus_gene_ensembl Squirrel genes (SpeTri2.0) SpeTri2.0
+## 45 ptroglodytes_gene_ensembl Chimpanzee genes (CHIMP2.1.4) CHIMP2.1.4
+## 46 xtropicalis_gene_ensembl Xenopus genes (JGI 4.2) JGI 4.2
+## 47 odegus_gene_ensembl Degu genes (OctDeg1.0) OctDeg1.0
+## 48 choffmanni_gene_ensembl Sloth genes (choHof1) choHof1
+## 49 dmelanogaster_gene_ensembl Fruitfly genes (BDGP6) BDGP6
+## 50 tguttata_gene_ensembl Zebra Finch genes (taeGut3.2.4) taeGut3.2.4
+## 51 vpacos_gene_ensembl Alpaca genes (vicPac1) vicPac1
+## 52 falbicollis_gene_ensembl Flycatcher genes (FicAlb_1.4) FicAlb_1.4
+## 53 acarolinensis_gene_ensembl Anole lizard genes (AnoCar2.0) AnoCar2.0
+## 54 caperea_gene_ensembl Brazilian guinea pig genes (CavAp1.0) CavAp1.0
+## 55 dnovemcinctus_gene_ensembl Armadillo genes (Dasnov3.0) Dasnov3.0
+## 56 ggallus_gene_ensembl Chicken genes (Gallus_gallus-5.0) Gallus_gallus-5.0
+## 57 pvampyrus_gene_ensembl Megabat genes (pteVam1) pteVam1
+## 58 aplatyrhynchos_gene_ensembl Duck genes (BGI_duck_1.0) BGI_duck_1.0
+## 59 mmulatta_gene_ensembl Macaque genes (Mmul_8.0.1) Mmul_8.0.1
+## 60 neugenii_gene_ensembl Wallaby genes (Meug_1.0) Meug_1.0
+## 61 mlucifugus_gene_ensembl Microbat genes (Myoluc2.0) Myoluc2.0
+## 62 xmaculatus_gene_ensembl Platyfish genes (Xipmac4.4.2) Xipmac4.4.2
+## 63 csabaeus_gene_ensembl Vervet-AGM genes (ChlSab1.1) ChlSab1.1
+## 64 hsapiens_gene_ensembl Human genes (GRCh38.p10) GRCh38.p10
+## 65 pformosa_gene_ensembl Amazon molly genes (Poecilia_formosa-5.1.2) Poecilia_formosa-5.1.2
+## 66 psinensis_gene_ensembl Chinese softshell turtle genes (PelSin_1.0) PelSin_1.0
+## 67 scerevisiae_gene_ensembl Saccharomyces cerevisiae genes (R64-1-1) R64-1-1
+## 68 lchalumnae_gene_ensembl Coelacanth genes (LatCha1) LatCha1
+## 69 fcatus_gene_ensembl Cat genes (Felis_catus_6.2) Felis_catus_6.2
+## 70 dordii_gene_ensembl Kangaroo rat genes (Dord_2.0) Dord_2.0
+## 71 amexicanus_gene_ensembl Cave fish genes (AstMex102) AstMex102
+## 72 tbelangeri_gene_ensembl Tree Shrew genes (tupBel1) tupBel1
+## 73 celegans_gene_ensembl Caenorhabditis elegans genes (WBcel235) WBcel235
+## 74 nleucogenys_gene_ensembl Gibbon genes (Nleu1.0) Nleu1.0
+## 75 pcapensis_gene_ensembl Hyrax genes (proCap1) proCap1
+## 76 ccrigri_gene_ensembl Chinese hamster CriGri genes (CriGri_1.0) CriGri_1.0
+## 77 eeuropaeus_gene_ensembl Hedgehog genes (eriEur1) eriEur1
+## 78 clanigera_gene_ensembl Long-tailed chinchilla genes (ChiLan1.0) ChiLan1.0
+## 79 mpahari_gene_ensembl Shrew mouse genes (PAHARI_EIJ_v1.1) PAHARI_EIJ_v1.1
+## 80 loculatus_gene_ensembl Spotted gar genes (LepOcu1) LepOcu1
+## 81 ggorilla_gene_ensembl Gorilla genes (gorGor3.1) gorGor3.1
+## 82 sscrofa_gene_ensembl Pig genes (Sscrofa11.1) Sscrofa11.1
+## 83 cfamiliaris_gene_ensembl Dog genes (CanFam3.1) CanFam3.1
+## 84 sharrisii_gene_ensembl Tasmanian devil genes (Devil_ref v7.0) Devil_ref v7.0
+## 85 mspreteij_gene_ensembl Algerian mouse genes (SPRET_EiJ_v1) SPRET_EiJ_v1</code></pre>
<p>To select a dataset we can update the <code>Mart</code> object using the function <code>useDataset()</code>. In the example below we choose to use the hsapiens dataset.</p>
<pre class="r"><code>ensembl = useDataset("hsapiens_gene_ensembl",mart=ensembl)</code></pre>
<p>Or alternatively if the dataset one wants to use is known in advance, we can select a BioMart database and dataset in one step by:</p>
@@ -200,12 +284,12 @@ filters[1:5,]</code></pre>
<p><em>Attributes</em> define the values we are interested in retrieving. For example, we may want to retrieve gene symbols or chromosomal coordinates. The <code>listAttributes()</code> function displays all available attributes in the selected dataset.</p>
<pre class="r"><code>attributes = listAttributes(ensembl)
attributes[1:5,]</code></pre>
-<pre><code>## name description page
-## 1 ensembl_gene_id Gene stable ID feature_page
-## 2 ensembl_transcript_id Transcript stable ID feature_page
-## 3 ensembl_peptide_id Protein stable ID feature_page
-## 4 ensembl_exon_id Exon stable ID feature_page
-## 5 description Gene description feature_page</code></pre>
+<pre><code>## name description page
+## 1 ensembl_gene_id Gene stable ID feature_page
+## 2 ensembl_gene_id_version Gene stable ID version feature_page
+## 3 ensembl_transcript_id Transcript stable ID feature_page
+## 4 ensembl_transcript_id_version Transcript stable ID version feature_page
+## 5 ensembl_peptide_id Protein stable ID feature_page</code></pre>
<p>The <code>getBM()</code> function is the main query function in <em><a href="http://bioconductor.org/packages/biomaRt">biomaRt</a></em>. It has four main arguments:</p>
<ul>
<li><code>attributes</code>: is a vector of attributes that one wants to retrieve (= the output of the query).</li>
@@ -253,12 +337,12 @@ goids = getBM(attributes = c('entrezgene', 'go_id'),
mart = ensembl)
head(goids)</code></pre>
<pre><code>## entrezgene go_id
-## 1 673 GO:0044297
-## 2 673 GO:0043005
-## 3 673 GO:0016020
-## 4 673 GO:0005886
-## 5 673 GO:0005739
-## 6 673 GO:0005737</code></pre>
+## 1 673 GO:0000166
+## 2 673 GO:0004672
+## 3 673 GO:0004674
+## 4 673 GO:0005524
+## 5 673 GO:0006468
+## 6 673 GO:0010628</code></pre>
</div>
<div id="retrieve-all-hugo-gene-symbols-of-genes-that-are-located-on-chromosomes-1720-or-y-and-are-associated-with-specific-go-terms" class="section level2">
<h2><span class="header-section-number">4.3</span> Retrieve all HUGO gene symbols of genes that are located on chromosomes 17,20 or Y, and are associated with specific GO terms</h2>
@@ -333,20 +417,20 @@ ipro</code></pre>
values = 'GO:0004707',
mart = ensembl)</code></pre>
<pre><code>## entrezgene hgnc_symbol
-## 1 1432 MAPK14
-## 2 5596 MAPK4
-## 3 225689 MAPK15
-## 4 5603 MAPK13
-## 5 5601 MAPK9
+## 1 225689 MAPK15
+## 2 5594 MAPK1
+## 3 5595 MAPK3
+## 4 6300 MAPK12
+## 5 5600 MAPK11
## 6 51701 NLK
-## 7 5594 MAPK1
-## 8 5599 MAPK8
-## 9 5602 MAPK10
-## 10 6300 MAPK12
+## 7 5598 MAPK7
+## 8 5596 MAPK4
+## 9 1432 MAPK14
+## 10 5603 MAPK13
## 11 5597 MAPK6
-## 12 5600 MAPK11
-## 13 5598 MAPK7
-## 14 5595 MAPK3</code></pre>
+## 12 5599 MAPK8
+## 13 5601 MAPK9
+## 14 5602 MAPK10</code></pre>
</div>
<div id="given-a-set-of-entrezgene-identifiers-retrieve-100bp-upstream-promoter-sequences" class="section level2">
<h2><span class="header-section-number">4.7</span> Given a set of EntrezGene identifiers, retrieve 100bp upstream promoter sequences</h2>
@@ -376,7 +460,10 @@ getSequence(id = entrez,
seqType="coding_gene_flank",
upstream=100,
mart=ensembl) </code></pre>
-<pre><code>## Error in getBM(c(seqType, type), filters = c(type, "upstream_flank"), : Query ERROR: caught BioMart::Exception::Usage: Filter upstream_flank NOT FOUND</code></pre>
+<pre><code>## coding_gene_flank entrezgene
+## 1 CCTCCGCCTCCGCCTCCGCCTCCGCCTCCCCCAGCTCTCCGCCTCCCTTCCCCCTCCCCGCCCGACAGCGGCCGCTCGGGCCCCGGCTCTCGGTTATAAG 673
+## 2 CACGTTTCCGCCCTTTGCAATAAGGAAATACATAGTTTACTTTCATTTTTGACTCTGAGGCTCTTTCCAACGCTGTAAAAAAGGACAGAGGCTGTTCCCT 837
+## 3 TCCTTCTCTGCAGGCCCAGGTGACCCAGGGTTGGAAGTGTCTCATGCTGGATCCCCACTTTTCCTCTTGCAGCAGCCAGACTGCCTTCCGGGTCACTGCC 7157</code></pre>
</div>
<div id="retrieve-all-5-utr-sequences-of-all-genes-that-are-located-on-chromosome-3-between-the-positions-185514033-and-185535839" class="section level2">
<h2><span class="header-section-number">4.8</span> Retrieve all 5’ UTR sequences of all genes that are located on chromosome 3 between the positions 185,514,033 and 185,535,839</h2>
@@ -387,10 +474,10 @@ getSequence(id = entrez,
mart=ensembl)
utr5</code></pre>
<pre><code>## 5utr
-## 1 ATTCTTGTGAATGTGACACACGATCTCTCCAGTTTCCAT
-## 2 Sequence unavailable
-## 3 TGAGCAAAATCCCACAGTGGAAACTCTTAAGCCTCTGCGAAGTAAATCATTCTTGTGAATGTGACACACGATCTCTCCAGTTTCCAT
-## 4 AGTCCCTAGGGAACTTCCTGTTGTCACCACACCTCTGAGTCGTCTGAGCTCACTGTGAGCAAAATCCCACAGTGGAAACTCTTAAGCCTCTGCGAAGTAAATCATTCTTGTGAATGTGACACACGATCTCTCCAGTTTCCAT
+## 1 TGAGCAAAATCCCACAGTGGAAACTCTTAAGCCTCTGCGAAGTAAATCATTCTTGTGAATGTGACACACGATCTCTCCAGTTTCCAT
+## 2 ATTCTTGTGAATGTGACACACGATCTCTCCAGTTTCCAT
+## 3 AGTCCCTAGGGAACTTCCTGTTGTCACCACACCTCTGAGTCGTCTGAGCTCACTGTGAGCAAAATCCCACAGTGGAAACTCTTAAGCCTCTGCGAAGTAAATCATTCTTGTGAATGTGACACACGATCTCTCCAGTTTCCAT
+## 4 Sequence unavailable
## entrezgene
## 1 200879
## 2 200879
@@ -406,23 +493,23 @@ utr5</code></pre>
mart=ensembl)
protein</code></pre>
<pre><code>## peptide
-## 1 ALLFHKMMFETIPMFSGGTCNPQFVVCQLKVKIYSSNSGPTRREDKFMYFEFPQPLPVCGDIKVEFFHKQNKMLKKDKMFHFWVNTFFIPGPEETSEKVENGSLCDQEIDSICSIERADNDKEYLVLTLTKNDLDKANKDKANRYFSPNFKVS*
-## 2 MAQTPAFDKPKVELHVHLDGSIKPETILYYGRRRGIALPANTAEGLLNVIGMDKPLTLPDFLAKFDYYMPAIAGCREAIKRIAYEFVEMKAKEGVVYVEVRYSPHLLANSKVEPIPWNQAEGDLTPDEVVALVGQGLQEGERDFGVKARSILCCMRHQPNWSPKVVELCKKYQQQTVVAIDLAGDETIPGSSLLPGHVQAYQEAVKSGIHRTVHAGEVGSAEVVKEAVDILKTERLGHGYHTLEDQALYNRLRQENMHFEAQK*
-## 3 MAQTPAFDKPKVELHVHLDGSIKPETILYYGRRRGIALPANTAEGLLNVIGMDKPLTLPDFLAKFDYYMPAIAGCREAIKRIAYEFVEMKAKEGVVYVEVRYSPHLLANSKVEPIPWNQAEGDLTPDEVVALVGQGLQEGERDFGVKARSILCCMRHQPNWSPKVVELCKKYQQQTVVAIDLAGDETIPGSSLLPGHVQAYQEAVKSGIHRTVHAGEVGSAEVVKEAVDILKTERLGHGYHTLEDQALYNRLRQENMHFEICPWSSYLTGAWKPDTEHAVIRLKNDQANYSLNTDDPLIFKSTLDTDYQMTKRDMGFTEEEFKRLNINAAKSSFLPEDEKRELLDLLYKAYGMPPSASAGQNL*
-## 4 Sequence unavailable
-## 5 Sequence unavailable
-## 6 MAQTPAFDKPKVELHVHLDGSIKPETILYYGRRRGIALPANTAEGLLNVIGMDKPLTLPDFLAKFDYYMPAIARL*
-## 7 MAQTPAFDKPKVELHVHLDGSIKPETILYYGRRRGIALPANTAEGLLNVIGMDKPLTLPDFLAKFDYYMPAIAGCREAIKRIAYEFVEMKAKEGVVYVEVRYSPHLLANSKVEPIPWNQAEGDLTPDEVVALVGQGLQEGERDFGVKARSILCCMRHQPNWSPKVVELCKKYQQQTVVAIDLAGDETIPGSSLLPGHVQAYQAVDILKTERLGHGYHTLEDQALYNRLRQENMHFEICPWSSYLTGAWKPDTEHAVIRLKNDQANYSLNTDDPLIFKSTLDTDYQMTKRDMGFTEEEFKRLNINAAKSSFLPEDEKRELLDLLYKAYGMPPSASAGQNL*
-## 8 MTAIIKEIVSRNKRRYQEDGFDLDLTYIYPNIIAMGFPAERLEGVYRNNIDDVVRFLDSKHKNHYKIYNLCAERHYDTAKFNCRVAQYPFEDHNPPQLELIKPFCEDLDQWLSEDDNHVAAIHCKAGKGRTGVMICAYLLHRGKFLKAQEALDFYGEVRTRDKKGVTIPSQRRYVYYYSYLLKNHLDYRPVALLFHKMMFETIPMFSGGTCNPQFVVCQLKVKIYSSNSGPTRREDKFMYFEFPQPLPVCGDIKVEFFHKQNKMLKKDKMFHFWVNTFFIPGPEETSEKVENGSLCDQEIDSICSIERADNDKEYLVLTLTKNDLDKANKDKANRYFSPNFKVKLYFTKTVEEPSNPEASSSTSVTPDVSDNEPDHYRYSDTTDSDPENEPFDEDQHTQITKV*
+## 1 Sequence unavailable
+## 2 MAQTPAFDKPKVELHVHLDGSIKPETILYYGRRRGIALPANTAEGLLNVIGMDKPLTLPDFLAKFDYYMPAIAGCREAIKRIAYEFVEMKAKEGVVYVEVRYSPHLLANSKVEPIPWNQAEGDLTPDEVVALVGQGLQEGERDFGVKARSILCCMRHQPNWSPKVVELCKKYQQQTVVAIDLAGDETIPGSSLLPGHVQAYQAVDILKTERLGHGYHTLEDQALYNRLRQENMHFEICPWSSYLTGAWKPDTEHAVIRLKNDQANYSLNTDDPLIFKSTLDTDYQMTKRDMGFTEEEFKRLNINAAKSSFLPEDEKRELLDLLYKAYGMPPSASAGQNL*
+## 3 Sequence unavailable
+## 4 MTAIIKEIVSRNKRRYQEDGFDLDLTYIYPNIIAMGFPAERLEGVYRNNIDDVVRFLDSKHKNHYKIYNLCAERHYDTAKFNCRVAQYPFEDHNPPQLELIKPFCEDLDQWLSEDDNHVAAIHCKAGKGRTGVMICAYLLHRGKFLKAQEALDFYGEVRTRDKKGVTIPSQRRYVYYYSYLLKNHLDYRPVALLFHKMMFETIPMFSGGTCNPQFVVCQLKVKIYSSNSGPTRREDKFMYFEFPQPLPVCGDIKVEFFHKQNKMLKKDKMFHFWVNTFFIPGPEETSEKVENGSLCDQEIDSICSIERADNDKEYLVLTLTKNDLDKANKDKANRYFSPNFKVKLYFTKTVEEPSNPEASSSTSVTPDVSDNEPDHYRYSDTTDSDPENEPFDEDQHTQITKV*
+## 5 MAQTPAFDKPKVELHVHLDGSIKPETILYYGRRRGIALPANTAEGLLNVIGMDKPLTLPDFLAKFDYYMPAIARL*
+## 6 ALLFHKMMFETIPMFSGGTCNPQFVVCQLKVKIYSSNSGPTRREDKFMYFEFPQPLPVCGDIKVEFFHKQNKMLKKDKMFHFWVNTFFIPGPEETSEKVENGSLCDQEIDSICSIERADNDKEYLVLTLTKNDLDKANKDKANRYFSPNFKVS*
+## 7 MAQTPAFDKPKVELHVHLDGSIKPETILYYGRRRGIALPANTAEGLLNVIGMDKPLTLPDFLAKFDYYMPAIAGCREAIKRIAYEFVEMKAKEGVVYVEVRYSPHLLANSKVEPIPWNQAEGDLTPDEVVALVGQGLQEGERDFGVKARSILCCMRHQPNWSPKVVELCKKYQQQTVVAIDLAGDETIPGSSLLPGHVQAYQEAVKSGIHRTVHAGEVGSAEVVKEAVDILKTERLGHGYHTLEDQALYNRLRQENMHFEICPWSSYLTGAWKPDTEHAVIRLKNDQANYSLNTDDPLIFKSTLDTDYQMTKRDMGFTEEEFKRLNINAAKSSFLPEDEKRELLDLLYKAYGMPPSASAGQNL*
+## 8 MAQTPAFDKPKVELHVHLDGSIKPETILYYGRRRGIALPANTAEGLLNVIGMDKPLTLPDFLAKFDYYMPAIAGCREAIKRIAYEFVEMKAKEGVVYVEVRYSPHLLANSKVEPIPWNQAEGDLTPDEVVALVGQGLQEGERDFGVKARSILCCMRHQPNWSPKVVELCKKYQQQTVVAIDLAGDETIPGSSLLPGHVQAYQEAVKSGIHRTVHAGEVGSAEVVKEAVDILKTERLGHGYHTLEDQALYNRLRQENMHFEAQK*
## entrezgene
-## 1 5728
+## 1 100
## 2 100
-## 3 100
+## 3 5728
## 4 5728
## 5 100
-## 6 100
+## 6 5728
## 7 100
-## 8 5728</code></pre>
+## 8 100</code></pre>
</div>
<div id="retrieve-known-snps-located-on-the-human-chromosome-8-between-positions-148350-and-148612" class="section level2">
<h2><span class="header-section-number">4.10</span> Retrieve known SNPs located on the human chromosome 8 between positions 148350 and 148612</h2>
@@ -479,6 +566,11 @@ listMarts(archive = TRUE)
```
```
+## Warning in listMarts(archive = TRUE): The archive = TRUE argument is now deprecated.
+## Use listEnsemblMarts() to find the URL to directly query an Ensembl archive.
+```
+
+```
## biomart version
## 1 ensembl_mart_51 Ensembl 51
## 2 snp_mart_51 SNP 51
@@ -533,6 +625,11 @@ ensembl = useMart("ensembl_mart_46", dataset="hsapiens_gene_ensembl", archive =
```
```
+## Warning in listMarts(host = host, path = path, port = port, includeHosts = TRUE, : The archive = TRUE argument is now deprecated.
+## Use listEnsemblMarts() to find the URL to directly query an Ensembl archive.
+```
+
+```
## Note: requested host was redirected from www.ensembl.org to http://aug2007.archive.ensembl.org:80/biomart/martservice
## When using archived Ensembl versions this sometimes can result in connecting to a newer version than the intended Ensembl version
## Check your ensembl version using listMarts(mart)
@@ -547,8 +644,32 @@ After you selected the BioMart database and dataset, queries can be performed in
## Accessing archives through specifying the archive host
-->
-<p>Use the <a href="http://www.ensembl.org" class="uri">http://www.ensembl.org</a> website and go down the bottom of the page. Click on ‘view in Archive’ and select the archive you need. Copy the url and use that url as shown below to connect to the specified BioMart database. The example below shows how to query Ensembl 54.</p>
-<pre class="r"><code>listMarts(host='may2009.archive.ensembl.org')</code></pre>
+<p><em><a href="http://bioconductor.org/packages/biomaRt">biomaRt</a></em> provides the function <code>listEnsemblArchives()</code> to view the available archives. This function takes no arguments, and produces a table containing the names of the available archived versions, the date they were first available, and the URL where they can be accessed.</p>
+<pre class="r"><code>listEnsemblArchives()</code></pre>
+<pre><code>## version date url
+## [1,] "Ensembl GRCh37" "Feb 2014" "http://grch37.ensembl.org"
+## [2,] "Ensembl 89" "May 2017" "http://May2017.archive.ensembl.org"
+## [3,] "Ensembl 88" "Mar 2017" "http://Mar2017.archive.ensembl.org"
+## [4,] "Ensembl 87" "Dec 2016" "http://Dec2016.archive.ensembl.org"
+## [5,] "Ensembl 86" "Oct 2016" "http://Oct2016.archive.ensembl.org"
+## [6,] "Ensembl 85" "Jul 2016" "http://Jul2016.archive.ensembl.org"
+## [7,] "Ensembl 84" "Mar 2016" "http://Mar2016.archive.ensembl.org"
+## [8,] "Ensembl 83" "Dec 2015" "http://Dec2015.archive.ensembl.org"
+## [9,] "Ensembl 82" "Sep 2015" "http://Sep2015.archive.ensembl.org"
+## [10,] "Ensembl 81" "Jul 2015" "http://Jul2015.archive.ensembl.org"
+## [11,] "Ensembl 80" "May 2015" "http://May2015.archive.ensembl.org"
+## [12,] "Ensembl 79" "Mar 2015" "http://Mar2015.archive.ensembl.org"
+## [13,] "Ensembl 78" "Dec 2014" "http://Dec2014.archive.ensembl.org"
+## [14,] "Ensembl 77" "Oct 2014" "http://Oct2014.archive.ensembl.org"
+## [15,] "Ensembl 76" "Aug 2014" "http://Aug2014.archive.ensembl.org"
+## [16,] "Ensembl 75" "Feb 2014" "http://Feb2014.archive.ensembl.org"
+## [17,] "Ensembl 74" "Dec 2013" "http://Dec2013.archive.ensembl.org"
+## [18,] "Ensembl 67" "May 2012" "http://May2012.archive.ensembl.org"
+## [19,] "Ensembl 54" "May 2009" "http://May2009.archive.ensembl.org"</code></pre>
+<p>Alternatively, one can use the <a href="http://www.ensembl.org" class="uri">http://www.ensembl.org</a> website to find an archived version. From the main page, scroll down to the bottom of the page, click on ‘view in Archive’ and select the archive you need.</p>
+<p><em>You will notice that there is an archive URL even for the current release of Ensembl. It can be useful to use this if you wish to ensure that a script you write now will return exactly the same results in the future. Using <code>www.ensembl.org</code> will always access the current release, and so the data retrieved may change over time as new releases come out.</em></p>
+<p>Whichever method you use to find the URL of the archive you wish to query, copy the url and use that in the <code>host</code> argument as shown below to connect to the specified BioMart database. The example below shows how to query Ensembl 54.</p>
+<pre class="r"><code>listMarts(host = 'may2009.archive.ensembl.org')</code></pre>
<pre><code>## biomart version
## 1 ENSEMBL_MART_ENSEMBL Ensembl 54
## 2 ENSEMBL_MART_SNP Ensembl Variation 54
@@ -556,9 +677,9 @@ After you selected the BioMart database and dataset, queries can be performed in
## 4 REACTOME Reactome(CSHL US)
## 5 wormbase_current WormBase (CSHL US)
## 6 pride PRIDE (EBI UK)</code></pre>
-<pre class="r"><code>ensembl54=useMart(host='may2009.archive.ensembl.org',
- biomart='ENSEMBL_MART_ENSEMBL',
- dataset='hsapiens_gene_ensembl')</code></pre>
+<pre class="r"><code>ensembl54 <- useMart(host='may2009.archive.ensembl.org',
+ biomart='ENSEMBL_MART_ENSEMBL',
+ dataset='hsapiens_gene_ensembl')</code></pre>
</div>
<div id="using-a-biomart-other-than-ensembl" class="section level1">
<h1><span class="header-section-number">6</span> Using a BioMart other than Ensembl</h1>
@@ -618,7 +739,7 @@ head(listFilters(wormbase))</code></pre>
<h3><span class="header-section-number">7.2.2</span> filterOptions</h3>
<p>Some filters have a limited set of values that can be given to them. To find out which values these are, one can use the <code>filterOptions()</code> function to retrieve the predetermined values of the respective filter.</p>
<pre class="r"><code>filterOptions("biotype",ensembl)</code></pre>
-<pre><code>## [1] "[3prime_overlapping_ncRNA,antisense,bidirectional_promoter_lncRNA,IG_C_gene,IG_C_pseudogene,IG_D_gene,IG_J_gene,IG_J_pseudogene,IG_pseudogene,IG_V_gene,IG_V_pseudogene,lincRNA,macro_lncRNA,miRNA,misc_RNA,Mt_rRNA,Mt_tRNA,non_coding,polymorphic_pseudogene,processed_pseudogene,processed_transcript,protein_coding,pseudogene,ribozyme,rRNA,scaRNA,scRNA,sense_intronic,sense_overlapping,snoRNA,snRNA,sRNA,TEC,transcribed_processed_pseudogene,transcribed_unitary_pseudogene, [...]
+<pre><code>## [1] "[3prime_overlapping_ncRNA,antisense_RNA,bidirectional_promoter_lncRNA,IG_C_gene,IG_C_pseudogene,IG_D_gene,IG_J_gene,IG_J_pseudogene,IG_pseudogene,IG_V_gene,IG_V_pseudogene,lincRNA,macro_lncRNA,miRNA,misc_RNA,Mt_rRNA,Mt_tRNA,non_coding,polymorphic_pseudogene,processed_pseudogene,processed_transcript,protein_coding,pseudogene,ribozyme,rRNA,scaRNA,scRNA,sense_intronic,sense_overlapping,snoRNA,snRNA,sRNA,TEC,transcribed_processed_pseudogene,transcribed_unitary_pseudog [...]
<p>If there are no predetermined values, e.g. for the entrezgene filter, then <code>filterOptions()</code> will return the type of filter it is. Most of the time the filter name or its description will suggest what values one can use for the respective filter (e.g. the entrezgene filter will work with entrezgene identifiers as values).</p>
</div>
</div>
@@ -630,13 +751,13 @@ pages</code></pre>
<pre><code>## [1] "feature_page" "structure" "homologs" "snp" "snp_somatic" "sequences"</code></pre>
<p>To show us a smaller list of attributes which belong to a specific page, we can now specify this in the <code>listAttributes()</code> function. <em>The set of attributes is still quite long, so we use <code>head()</code> to show only the first few items here.</em></p>
<pre class="r"><code>head(listAttributes(ensembl, page="feature_page"))</code></pre>
-<pre><code>## name description page
-## 1 ensembl_gene_id Gene stable ID feature_page
-## 2 ensembl_transcript_id Transcript stable ID feature_page
-## 3 ensembl_peptide_id Protein stable ID feature_page
-## 4 ensembl_exon_id Exon stable ID feature_page
-## 5 description Gene description feature_page
-## 6 chromosome_name Chromosome/scaffold name feature_page</code></pre>
+<pre><code>## name description page
+## 1 ensembl_gene_id Gene stable ID feature_page
+## 2 ensembl_gene_id_version Gene stable ID version feature_page
+## 3 ensembl_transcript_id Transcript stable ID feature_page
+## 4 ensembl_transcript_id_version Transcript stable ID version feature_page
+## 5 ensembl_peptide_id Protein stable ID feature_page
+## 6 ensembl_peptide_id_version Protein stable ID version feature_page</code></pre>
<p>We now get a short list of attributes related to the region where the genes are located.</p>
</div>
</div>
@@ -677,13 +798,13 @@ select(mart, keys=affy, columns=c('affy_hg_u133_plus_2','entrezgene'),
<div id="session-info" class="section level1">
<h1><span class="header-section-number">10</span> Session Info</h1>
<pre class="r"><code>sessionInfo()</code></pre>
-<pre><code>## R version 3.4.0 (2017-04-21)
+<pre><code>## R version 3.4.2 (2017-09-28)
## Platform: x86_64-pc-linux-gnu (64-bit)
-## Running under: Ubuntu 16.04.2 LTS
+## Running under: Ubuntu 16.04.3 LTS
##
## Matrix products: default
-## BLAS: /home/biocbuild/bbs-3.5-bioc/R/lib/libRblas.so
-## LAPACK: /home/biocbuild/bbs-3.5-bioc/R/lib/libRlapack.so
+## BLAS: /home/biocbuild/bbs-3.6-bioc/R/lib/libRblas.so
+## LAPACK: /home/biocbuild/bbs-3.6-bioc/R/lib/libRlapack.so
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C LC_TIME=en_US.UTF-8 LC_COLLATE=C
@@ -694,21 +815,52 @@ select(mart, keys=affy, columns=c('affy_hg_u133_plus_2','entrezgene'),
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
-## [1] biomaRt_2.32.1 BiocStyle_2.4.0
+## [1] biomaRt_2.34.0 BiocStyle_2.6.0
##
## loaded via a namespace (and not attached):
-## [1] Rcpp_0.12.11 AnnotationDbi_1.38.1 knitr_1.16 magrittr_1.5 IRanges_2.10.2
-## [6] BiocGenerics_0.22.0 stringr_1.2.0 tools_3.4.0 parallel_3.4.0 Biobase_2.36.2
-## [11] DBI_0.6-1 htmltools_0.3.6 yaml_2.1.14 rprojroot_1.2 digest_0.6.12
-## [16] S4Vectors_0.14.3 bitops_1.0-6 RCurl_1.95-4.8 memoise_1.1.0 evaluate_0.10
-## [21] RSQLite_1.1-2 rmarkdown_1.5 stringi_1.1.5 compiler_3.4.0 backports_1.1.0
-## [26] stats4_3.4.0 XML_3.98-1.7</code></pre>
+## [1] Rcpp_0.12.13 AnnotationDbi_1.40.0 knitr_1.17 magrittr_1.5 progress_1.1.2
+## [6] IRanges_2.12.0 BiocGenerics_0.24.0 bit_1.1-12 R6_2.2.2 rlang_0.1.2
+## [11] stringr_1.2.0 blob_1.1.0 tools_3.4.2 parallel_3.4.2 Biobase_2.38.0
+## [16] DBI_0.7 htmltools_0.3.6 assertthat_0.2.0 yaml_2.1.14 bit64_0.9-7
+## [21] rprojroot_1.2 digest_0.6.12 tibble_1.3.4 bookdown_0.5 S4Vectors_0.16.0
+## [26] bitops_1.0-6 RCurl_1.95-4.8 memoise_1.1.0 evaluate_0.10.1 RSQLite_2.0
+## [31] rmarkdown_1.6 stringi_1.1.5 compiler_3.4.2 prettyunits_1.0.2 backports_1.1.1
+## [36] stats4_3.4.2 XML_3.98-1.9</code></pre>
<pre class="r"><code>warnings()</code></pre>
<pre><code>## NULL</code></pre>
</div>
+
+</div>
+
+<script>
+
+// add bootstrap table styles to pandoc tables
+function bootstrapStylePandocTables() {
+ $('tr.header').parent('thead').parent('table').addClass('table table-condensed');
+}
+$(document).ready(function () {
+ bootstrapStylePandocTables();
+});
+
+
+</script>
+
+<script type="text/x-mathjax-config">
+ MathJax.Hub.Config({
+ "HTML-CSS": {
+ styles: {
+ ".MathJax_Display": {
+ "text-align": "center",
+ padding: "0px 150px 0px 65px",
+ margin: "0px 0px 0.5em"
+ },
+ }
+ }
+ });
+</script>
<!-- dynamically load mathjax for compatibility with self-contained -->
<script>
(function () {
diff --git a/man/listEnsemblArchives.Rd b/man/listEnsemblArchives.Rd
new file mode 100644
index 0000000..3a2901d
--- /dev/null
+++ b/man/listEnsemblArchives.Rd
@@ -0,0 +1,18 @@
+\name{listEnsemblArchives}
+\alias{listEnsemblArchives}
+\title{Lists the available archived versions of Ensembl}
+\description{Returns a table containing the available archived versions of
+Ensembl, along with the dates they were created and the URL used to access
+them.}
+
+\usage{listEnsemblArchives()}
+
+\arguments{}
+
+\author{Mike Smith}
+
+\examples{
+listEnsemblArchives()
+}
+\keyword{methods}
+
diff --git a/man/listMarts.Rd b/man/listMarts.Rd
index b1a2fe7..709cc38 100644
--- a/man/listMarts.Rd
+++ b/man/listMarts.Rd
@@ -14,7 +14,11 @@ marts there are to connect to.}
\item{path}{path to martservice that should be pasted behind the host to get to web service URL}
\item{port}{port to use in HTTP communication}
\item{includeHosts}{boolean to indicate if function should return host of the BioMart databases}
-\item{archive}{Boolean to indicate if you want to access archived versions of BioMart database}
+\item{archive}{Boolean to indicate if you want to access archived versions of
+BioMart database. Note that this argument is now deprecated and will be removed
+in the future. A better alternative is to specify the url of the archived
+BioMart you want to access. For Ensembl you can view the list of archives
+using \code{\link{listEnsemblArchives}}}
\item{ssl.verifypeer}{Set SSL peer verification on or off. By default ssl.verifypeer is set to TRUE}
\item{ensemblRedirect}{By default when you access Ensembl BioMart it will
redirect you to your local mirror, even if you have set a region specific
diff --git a/man/useMart.Rd b/man/useMart.Rd
index ac982c6..4397539 100644
--- a/man/useMart.Rd
+++ b/man/useMart.Rd
@@ -12,7 +12,7 @@ TRUE, ensemblRedirect = TRUE, version, verbose = FALSE)}
\item{host}{Host to connect to. Defaults to \code{www.ensembl.org}}
\item{path}{Path that should be pasted after the host to get access to the web service URL}
\item{port}{port to connect to, will be pasted between host and path}
-\item{archive}{Boolean to indicate if you want to access archived versions of BioMart databases. Note that this gives access to only a limited number of archived BioMarts and the most recent archives are often not available. A better alternative is to leave archive = FALSE and to specify the url of the archived BioMart you want to access see vignette for an example.}
+\item{archive}{Boolean to indicate if you want to access archived versions of BioMart databases. Note that this argument is now deprecated and will be removed in the future. A better alternative is to leave archive = FALSE and to specify the url of the archived BioMart you want to access. For Ensembl you can view the list of archives using \code{\link{listEnsemblArchives}}}
\item{ssl.verifypeer}{Set SSL peer verification on or off. By default ssl.verifypeer is set to TRUE}
\item{ensemblRedirect}{By default when you access Ensembl BioMart it will
redirect you to your local mirror, even if you have set a region specific
diff --git a/tests/testthat/test_hostProcessing.R b/tests/testthat/test_hostProcessing.R
new file mode 100644
index 0000000..43f069d
--- /dev/null
+++ b/tests/testthat/test_hostProcessing.R
@@ -0,0 +1,16 @@
+library(biomaRt)
+
+## adding http if needed
+host <- 'www.myurl.org'
+expect_equal(object = .cleanHostURL(host = host),
+ expected = "http://www.myurl.org")
+
+## stripping trailing slash
+host <- 'http://www.myurl.org/'
+expect_equal(object = .cleanHostURL(host = host),
+ expected = "http://www.myurl.org")
+
+## leave https already there
+host <- 'https://www.myurl.org'
+expect_equal(object = .cleanHostURL(host = host),
+ expected = "https://www.myurl.org")
diff --git a/tests/testthat/test_useMart.R b/tests/testthat/test_useMart.R
new file mode 100644
index 0000000..3ce0fd4
--- /dev/null
+++ b/tests/testthat/test_useMart.R
@@ -0,0 +1,13 @@
+library(biomaRt)
+
+## checking the show() method
+ensembl <- useMart("ensembl")
+ensembl_with_dataset <- useDataset(ensembl,
+ dataset = "xtropicalis_gene_ensembl")
+
+test_that("Show give sensible dataset information", {
+ expect_output(object = show(ensembl),
+ regexp = "No dataset selected")
+ expect_output(object = show(ensembl_with_dataset),
+ regexp = "Using the xtropicalis_gene_ensembl dataset")
+})
diff --git a/vignettes/biomaRt.Rmd b/vignettes/biomaRt.Rmd
index ed3b60f..c404e64 100644
--- a/vignettes/biomaRt.Rmd
+++ b/vignettes/biomaRt.Rmd
@@ -326,14 +326,24 @@ After you selected the BioMart database and dataset, queries can be performed in
## Accessing archives through specifying the archive host
-->
-Use the <http://www.ensembl.org> website and go down the bottom of the page. Click on 'view in Archive' and select the archive you need. Copy the url and use that url as shown below to connect to the specified BioMart database. The example below shows how to query Ensembl 54.
+`r Biocpkg("biomaRt")` provides the function `listEnsemblArchives()` to view the available archives. This function takes no arguments, and produces a table containing the names of the available archived versions, the date they were first available, and the URL where they can be accessed.
+
+```{r archiveMarts, echo = TRUE, eval = TRUE}
+listEnsemblArchives()
+```
+
+Alternatively, one can use the <http://www.ensembl.org> website to find archived versions. From the main page, scroll down to the bottom of the page, click on 'view in Archive' and select the archive you need.
+
+*You will notice that there is an archive URL even for the current release of Ensembl. It can be useful to use this if you wish to ensure that a script you write now will return exactly the same results in the future. Using `www.ensembl.org` will always access the current release, and so the data retrieved may change over time as new releases come out.*
+
+Whichever method you use to find the URL of the archive you wish to query, copy the url and use that in the `host` argument as shown below to connect to the specified BioMart database. The example below shows how to query Ensembl 54.
```{r archiveMarts3, echo = TRUE, eval = TRUE}
-listMarts(host='may2009.archive.ensembl.org')
-ensembl54=useMart(host='may2009.archive.ensembl.org',
- biomart='ENSEMBL_MART_ENSEMBL',
- dataset='hsapiens_gene_ensembl')
+listMarts(host = 'may2009.archive.ensembl.org')
+ensembl54 <- useMart(host='may2009.archive.ensembl.org',
+ biomart='ENSEMBL_MART_ENSEMBL',
+ dataset='hsapiens_gene_ensembl')
```
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/r-bioc-biomart.git
More information about the debian-med-commit
mailing list