[med-svn] [r-bioc-genomeinfodbdata] 02/04: New upstream version 0.99.0
Andreas Tille
tille at debian.org
Fri Sep 29 11:15:51 UTC 2017
This is an automated email from the git hooks/post-receive script.
tille pushed a commit to branch master
in repository r-bioc-genomeinfodbdata.
commit c445368d87ffbd9c2b6811376f31c443aaf596c2
Author: Andreas Tille <tille at debian.org>
Date: Fri Sep 29 13:13:20 2017 +0200
New upstream version 0.99.0
data/specData.rda | Bin 0 -> 7044532 bytes
data/speciesMap.rda | Bin 0 -> 8772520 bytes
data/validTaxIds.rda | Bin 0 -> 416192 bytes
debian/README.source | 16 -----
debian/changelog | 5 --
debian/compat | 1 -
debian/control | 20 -------
debian/copyright | 106 ----------------------------------
debian/rules | 4 --
debian/source/format | 1 -
debian/watch | 3 -
inst/scripts/updateGenomeInfoDbData.R | 65 +++++++++++++++++++++
man/GenomeInfoDbData-package.Rd | 43 ++++++++++++++
15 files changed, 119 insertions(+), 156 deletions(-)
new file mode 100644
index 0000000..b12ffd9
--- /dev/null
@@ -0,0 +1,10 @@
+Package: GenomeInfoDbData
+Title: Species and taxonomy ID look up tables used by GenomeInfoDb
+Description: Files for mapping between NCBI taxonomy ID and species. Used
+ by functions in the GenomeInfoDb package.
+Version: 0.99.0
+Author: Bioconductor Core Team
+Maintainer: Bioconductor Maintainer <maintainer at bioconductor.org>
+Depends: R (>= 3.3)
+biocViews: AnnotationData, Organism
+License: Artistic-2.0
new file mode 100644
index 0000000..8b13789
--- /dev/null
@@ -0,0 +1 @@
diff --git a/data/specData.rda b/data/specData.rda
new file mode 100644
index 0000000..69b9848
Binary files /dev/null and b/data/specData.rda differ
diff --git a/data/speciesMap.rda b/data/speciesMap.rda
new file mode 100644
index 0000000..86807c2
Binary files /dev/null and b/data/speciesMap.rda differ
diff --git a/data/validTaxIds.rda b/data/validTaxIds.rda
new file mode 100644
index 0000000..5a2866b
Binary files /dev/null and b/data/validTaxIds.rda differ
diff --git a/debian/README.source b/debian/README.source
deleted file mode 100644
index 4ad890c..0000000
--- a/debian/README.source
+++ /dev/null
@@ -1,16 +0,0 @@
-This package contains three mapping objects:
-* data/specData.rda: A data frame with columns 'tax_id', 'genus', and
- 'species'. Used to retrieve taxonomy ID by species and to list the
- available species.
-* data/validTaxIds.rda: An integer vector of valid taxonomy IDs created
- from 'speciesMap'. Used internally for quick taxonomy ID look ups.
-* data/speciesMap.rda: A data frame with columns 'taxon' and 'species'.
- Used internally to retrieve species by taxonomy ID.
-Scripts to generate these files are in inst/scripts.
-All originate from the public taxonomy dump at
-ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz
diff --git a/debian/changelog b/debian/changelog
deleted file mode 100644
index 17bd319..0000000
--- a/debian/changelog
+++ /dev/null
@@ -1,5 +0,0 @@
-r-bioc-genomeinfodbdata (0.99.0-1) unstable; urgency=medium
- * Initial release (Closes: #862550)
- -- Graham Inggs <ginggs at debian.org> Mon, 15 May 2017 16:12:39 +0200
diff --git a/debian/compat b/debian/compat
deleted file mode 100644
index f599e28..0000000
--- a/debian/compat
+++ /dev/null
@@ -1 +0,0 @@
diff --git a/debian/control b/debian/control
deleted file mode 100644
index 032a012..0000000
--- a/debian/control
+++ /dev/null
@@ -1,20 +0,0 @@
-Source: r-bioc-genomeinfodbdata
-Section: gnu-r
-Priority: optional
-Maintainer: Debian Med Packaging Team <debian-med-packaging at lists.alioth.debian.org>
-Uploaders: Graham Inggs <ginggs at debian.org>
-Build-Depends: debhelper (>= 10), dh-r, r-base-dev
-Standards-Version: 3.9.8
-Homepage: https://bioconductor.org/packages/GenomeInfoDbData/
-Vcs-Browser: https://anonscm.debian.org/viewvc/debian-med/trunk/packages/R/r-bioc-genomeinfodbdata/trunk/
-Vcs-Svn: svn://anonscm.debian.org/debian-med/trunk/packages/R/r-bioc-genomeinfodbdata/trunk/
-Package: r-bioc-genomeinfodbdata
-Architecture: all
-Depends: ${R:Depends}, ${misc:Depends}, ${shlibs:Depends}
-Recommends: ${R:Recommends}
-Suggests: ${R:Suggests}
-Description: BioConductor species and taxonomy ID look up tables
- This package contains files for mapping between NCBI taxonomy ID and species.
- .
- It is used by functions in the r-bioc-genomeinfodb package.
diff --git a/debian/copyright b/debian/copyright
deleted file mode 100644
index b7e6571..0000000
--- a/debian/copyright
+++ /dev/null
@@ -1,106 +0,0 @@
-Format: https://www.debian.org/doc/packaging-manuals/copyright-format/1.0/
-Upstream-Name: GenomeInfoDbData
-Upstream-Contact: Bioconductor Maintainer <maintainer at bioconductor.org>
-Source: https://bioconductor.org/packages/GenomeInfoDbData/
-Files: *
-Copyright: 2006-2017 Sonali Arora, Martin Morgan, Marc Carlson, H. Pagès
-License: Artistic-2.0
-Files: debian/*
-Copyright: 2017 Graham Inggs <ginggs at debian.org>
-License: Artistic-2.0
-License: Artistic-2.0
- The "Artistic License"
- .
- Preamble
- .
- 1. You may make and give away verbatim copies of the source form of the
- Standard Version of this Package without restriction, provided that
- you duplicate all of the original copyright notices and associated
- disclaimers.
- .
- 2. You may apply bug fixes, portability fixes and other modifications
- derived from the Public Domain or from the Copyright Holder. A
- Package modified in such a way shall still be considered the Standard
- Version.
- .
- 3. You may otherwise modify your copy of this Package in any way,
- provided that you insert a prominent notice in each changed file stating
- how and when you changed that file, and provided that you do at least
- ONE of the following:
- .
- a) place your modifications in the Public Domain or otherwise make them
- Freely Available, such as by posting said modifications to Usenet or
- an equivalent medium, or placing the modifications on a major archive
- site such as uunet.uu.net, or by allowing the Copyright Holder to include
- your modifications in the Standard Version of the Package.
- .
- b) use the modified Package only within your corporation or organization.
- .
- c) rename any non-standard executables so the names do not conflict
- with standard executables, which must also be provided, and provide
- a separate manual page for each non-standard executable that clearly
- documents how it differs from the Standard Version.
- .
- d) make other distribution arrangements with the Copyright Holder.
- .
- 4. You may distribute the programs of this Package in object code or
- executable form, provided that you do at least ONE of the following:
- .
- a) distribute a Standard Version of the executables and library files,
- together with instructions (in the manual page or equivalent) on where
- to get the Standard Version.
- .
- b) accompany the distribution with the machine-readable source of
- the Package with your modifications.
- .
- c) give non-standard executables non-standard names, and clearly
- document the differences in manual pages (or equivalent), together
- with instructions on where to get the Standard Version.
- .
- d) make other distribution arrangements with the Copyright Holder.
- .
- 5. You may charge a reasonable copying fee for any distribution of this
- Package. You may charge any fee you choose for support of this Package.
- You may not charge a fee for this Package itself. However, you may
- distribute this Package in aggregate with other (possibly commercial)
- programs as part of a larger (possibly commercial) software distribution
- provided that you do not advertise this Package as a product of your
- own. You may embed this Package's interpreter within an executable of
- yours (by linking); this shall be construed as a mere form of
- aggregation, provided that the complete Standard Version of the
- interpreter is so embedded.
- .
- 6. The scripts and library files supplied as input to or produced as
- output from the programs of this Package do not automatically fall under
- the copyright of this Package, but belong to whoever generated them, and
- may be sold commercially, and may be aggregated with this Package. If
- such scripts or library files are aggregated with this Package via the
- so-called "undump" or "unexec" methods of producing a binary executable
- image, then distribution of such an image shall neither be construed as
- a distribution of this Package nor shall it fall under the restrictions
- of Paragraphs 3 and 4, provided that you do not represent such an
- executable image as a Standard Version of this Package.
- .
- 7. C subroutines (or comparably compiled subroutines in other
- languages) supplied by you and linked into this Package in order to
- emulate subroutines and variables of the language defined by this
- Package shall not be considered part of this Package, but are the
- equivalent of input as in Paragraph 6, provided these subroutines do
- not change the language in any way that would cause it to fail the
- regression tests for the language.
- .
- 8. Aggregation of this Package with a commercial distribution is always
- permitted provided that the use of this Package is embedded; that is,
- when no overt attempt is made to make this Package's interfaces visible
- to the end user of the commercial distribution. Such use shall not be
- construed as a distribution of this Package.
- .
- 9. The name of the Copyright Holder may not be used to endorse or promote
- products derived from this software without specific prior written permission.
- .
diff --git a/debian/rules b/debian/rules
deleted file mode 100755
index 68d9a36..0000000
--- a/debian/rules
+++ /dev/null
@@ -1,4 +0,0 @@
-#!/usr/bin/make -f
- dh $@ --buildsystem R
diff --git a/debian/source/format b/debian/source/format
deleted file mode 100644
index 163aaf8..0000000
--- a/debian/source/format
+++ /dev/null
@@ -1 +0,0 @@
-3.0 (quilt)
diff --git a/debian/watch b/debian/watch
deleted file mode 100644
index 9248da7..0000000
--- a/debian/watch
+++ /dev/null
@@ -1,3 +0,0 @@
-opts=downloadurlmangle=s?^(.*)\.\.?https:$1packages/release/data/annotation? \
-http://www.bioconductor.org/packages/release/data/annotation/html/GenomeInfoDbData.html .*/GenomeInfoDbData_(.*).tar.gz
diff --git a/inst/scripts/updateGenomeInfoDbData.R b/inst/scripts/updateGenomeInfoDbData.R
new file mode 100644
index 0000000..8cfcca8
--- /dev/null
+++ b/inst/scripts/updateGenomeInfoDbData.R
@@ -0,0 +1,65 @@
+## Scripts for updating specData, speciesMap and validTaxIds
+## Download and unpack mapping file:
+## ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz
+## Generates specData
+.processTaxNamesFile <- function(filesDir=getwd()){
+## species <- read.delim('names.dmp',header = FALSE,sep = "|")
+ dest <- file.path(filesDir, "names.dmp")
+ data <- read.delim(dest, header=FALSE, sep="\t", quote="",
+ stringsAsFactors=FALSE)
+ species <- data[,seq(1, dim(data)[2], by=2)] ## Throw away 'pipe columns'
+ colnames(species) <- c('tax_id','name_txt','unique_name','name_class')
+ ## keep only some cols
+ species <- species[,c(1:2,4)]
+ ## throw away tabs from second col
+ species[[2]] <- gsub('\t','',species[[2]])
+ ## And the third col
+ species[[3]] <- gsub('\t','',species[[3]])
+ ## throw away rows where the third column doesn't say 'scientific name'
+ keep <- grepl('scientific name', species[[3]])
+ species <- species[keep,1:2]
+ ## split second column by first space:
+ rawSpec <- species[[2]]
+ spltSpec <- strsplit(rawSpec, split=" ")
+ genusDat <- unlist(lapply(spltSpec, function(x){x[1]}))
+ .getRest <- function(x){
+ if(length(x) > 1){
+ return(paste(x[2:length(x)], collapse=" "))
+ }else{
+ return(NA)
+ }
+ }
+ speciesDat <- unlist(lapply(spltSpec, .getRest))
+ specData <- data.frame(tax_id=as.integer(species[[1]]), ## integer
+ genus=as.factor(genusDat), ## factor
+ species=speciesDat, ## character
+ stringsAsFactors=FALSE)
+ save(specData, file='specData.rda', compress="xz")
+## Generates speciesMap and validTaxIds
+.processSpeciesMapData <- function(){
+ con <- file('names.dmp')
+ species <- readLines(con)
+ close(con)
+ splt <- strsplit(species, split='\\t\\|\\t')
+ ## Throw away elements where column 4 is not 'scientific name' or 'synonym'
+ idx1 <- unlist(lapply(splt, function(x){grepl('scientific name', x[4])}))
+ idx2 <- unlist(lapply(splt, function(x){grepl('synonym', x[4])}))
+ idx <- idx1 | idx2
+ splt <- splt[idx]
+ ## and keep only 1st two elements
+ taxon <- as.integer(unlist(lapply(splt, function(x){x[1]})))
+ species <- unlist(lapply(splt, function(x){x[2]}))
+ speciesMap <- data.frame(taxon, ## integer
+ species, ## character
+ stringsAsFactors=FALSE)
+ save(speciesMap, file='speciesMap.rda', compress="xz")
+ ## Then get the valid Tax IDs.
+ validTaxIds <- unique(speciesMap$taxon) ## integer
+ save(validTaxIds, file='validTaxIds.rda', compress="xz")
diff --git a/man/GenomeInfoDbData-package.Rd b/man/GenomeInfoDbData-package.Rd
new file mode 100644
index 0000000..c80a8a5
--- /dev/null
+++ b/man/GenomeInfoDbData-package.Rd
@@ -0,0 +1,43 @@
+\title{Species and taxonomy ID look up tables}
+ This package contains three mapping objects:
+ \itemize{
+ \item specData: A data frame with columns \sQuote{tax_id},
+ \sQuote{genus}, and \sQuote{species}. Used to retrieve taxonomy
+ ID by species and to list the available species.
+ \item validTaxIds: An integer vector of valid taxonomy IDs created from
+ \code{speciesMap}. Used internally for quick taxonomy ID look ups.
+ \item speciesMap: A data frame with columns \sQuote{taxon} and
+ \sQuote{species}. Used internally to retrieve species by taxonomy ID.
+ }
+ Scripts to generate these files are in GenomeInfoDbData/inst/scripts. All
+ originate from the taxdump download at
+ \url{ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz}.
+sapply(speciesMap, class) # taxon species
+ # "integer" "character"
+subset(speciesMap, species=="Homo sapiens")$taxon # [1] 9606
+\author{Bioconductor Core Team}
