[med-svn] [r-bioc-genomeinfodbdata] 02/04: New upstream version 0.99.0

Fri Sep 29 11:15:51 UTC 2017

This is an automated email from the git hooks/post-receive script.

tille pushed a commit to branch master
in repository r-bioc-genomeinfodbdata.

commit c445368d87ffbd9c2b6811376f31c443aaf596c2
Author: Andreas Tille <tille at debian.org>
Date:   Fri Sep 29 13:13:20 2017 +0200

    New upstream version 0.99.0
---
 DESCRIPTION                           |  10 ++++
 NAMESPACE                             |   1 +
 data/specData.rda                     | Bin 0 -> 7044532 bytes
 data/speciesMap.rda                   | Bin 0 -> 8772520 bytes
 data/validTaxIds.rda                  | Bin 0 -> 416192 bytes
 debian/README.source                  |  16 -----
 debian/changelog                      |   5 --
 debian/compat                         |   1 -
 debian/control                        |  20 -------
 debian/copyright                      | 106 ----------------------------------
 debian/rules                          |   4 --
 debian/source/format                  |   1 -
 debian/watch                          |   3 -
 inst/scripts/updateGenomeInfoDbData.R |  65 +++++++++++++++++++++
 man/GenomeInfoDbData-package.Rd       |  43 ++++++++++++++
 15 files changed, 119 insertions(+), 156 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
new file mode 100644
index 0000000..b12ffd9
--- /dev/null
+++ b/DESCRIPTION
@@ -0,0 +1,10 @@
+Package: GenomeInfoDbData 
+Title: Species and taxonomy ID look up tables used by GenomeInfoDb 
+Description: Files for mapping between NCBI taxonomy ID and species. Used
+        by functions in the GenomeInfoDb package.
+Version: 0.99.0
+Author: Bioconductor Core Team 
+Maintainer: Bioconductor Maintainer <maintainer at bioconductor.org>
+Depends: R (>= 3.3)
+biocViews: AnnotationData, Organism
+License: Artistic-2.0
diff --git a/NAMESPACE b/NAMESPACE
new file mode 100644
index 0000000..8b13789
--- /dev/null
+++ b/NAMESPACE
@@ -0,0 +1 @@
+
diff --git a/data/specData.rda b/data/specData.rda
new file mode 100644
index 0000000..69b9848
Binary files /dev/null and b/data/specData.rda differ
diff --git a/data/speciesMap.rda b/data/speciesMap.rda
new file mode 100644
index 0000000..86807c2
Binary files /dev/null and b/data/speciesMap.rda differ
diff --git a/data/validTaxIds.rda b/data/validTaxIds.rda
new file mode 100644
index 0000000..5a2866b
Binary files /dev/null and b/data/validTaxIds.rda differ
diff --git a/debian/README.source b/debian/README.source
deleted file mode 100644
index 4ad890c..0000000
--- a/debian/README.source
+++ /dev/null
@@ -1,16 +0,0 @@
-This package contains three mapping objects:
-
-* data/speciesMap.rda: A data frame with columns 'tax_id', 'genus', and
-  'species'.  Used to retrieve taxonomy ID by species and returns list of
-  available species.
-
-* data/validTaxIds.rda: An integer vector of valid taxonomy IDs created
-  from 'speciesMap'.  Used internally for quick taxonomy ID look ups.
-
-* data/specData.rds: A data frame with columns 'taxon' and 'species'.
-  Used internally to retrieve species by taxonomy ID.
-
-
-Scripts to generate these files are in inst/scripts.
-All originate from the public taxonomy dump at
-ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz
diff --git a/debian/changelog b/debian/changelog
deleted file mode 100644
index 17bd319..0000000
--- a/debian/changelog
+++ /dev/null
@@ -1,5 +0,0 @@
-r-bioc-genomeinfodbdata (0.99.0-1) unstable; urgency=medium
-
-  * Initial release (Closes: #862550)
-
- -- Graham Inggs <ginggs at debian.org>  Mon, 15 May 2017 16:12:39 +0200
diff --git a/debian/compat b/debian/compat
deleted file mode 100644
index f599e28..0000000
--- a/debian/compat
+++ /dev/null
@@ -1 +0,0 @@
-10
diff --git a/debian/control b/debian/control
deleted file mode 100644
index 032a012..0000000
--- a/debian/control
+++ /dev/null
@@ -1,20 +0,0 @@
-Source: r-bioc-genomeinfodbdata
-Section: gnu-r
-Priority: optional
-Maintainer: Debian Med Packaging Team <debian-med-packaging at lists.alioth.debian.org>
-Uploaders: Graham Inggs <ginggs at debian.org>
-Build-Depends: debhelper (>= 10), dh-r, r-base-dev
-Standards-Version: 3.9.8
-Homepage: https://bioconductor.org/packages/GenomeInfoDbData/
-Vcs-Browser: https://anonscm.debian.org/viewvc/debian-med/trunk/packages/R/r-bioc-genomeinfodbdata/trunk/
-Vcs-Svn: svn://anonscm.debian.org/debian-med/trunk/packages/R/r-bioc-genomeinfodbdata/trunk/
-
-Package: r-bioc-genomeinfodbdata
-Architecture: all
-Depends: ${R:Depends}, ${misc:Depends}, ${shlibs:Depends}
-Recommends: ${R:Recommends}
-Suggests: ${R:Suggests}
-Description: BioConductor species and taxonomy ID look up tables
- This package contains files for mapping between NCBI taxonomy ID and species.
- .
- It is used by functions in the r-bioc-genomeinfodb package.
diff --git a/debian/copyright b/debian/copyright
deleted file mode 100644
index b7e6571..0000000
--- a/debian/copyright
+++ /dev/null
@@ -1,106 +0,0 @@
-Format: https://www.debian.org/doc/packaging-manuals/copyright-format/1.0/
-Upstream-Name: GenomeInfoDbData
-Upstream-Contact: Bioconductor Maintainer <maintainer at bioconductor.org>
-Source: https://bioconductor.org/packages/GenomeInfoDbData/
-
-Files: *
-Copyright: 2006-2017 Sonali Arora, Martin Morgan, Marc Carlson, H. Pagès
-License: Artistic-2.0
-
-Files: debian/*
-Copyright: 2017 Graham Inggs <ginggs at debian.org>
-License: Artistic-2.0
-
-License: Artistic-2.0
-			 The "Artistic License"
- .
-				Preamble
- .
- 1. You may make and give away verbatim copies of the source form of the
-    Standard Version of this Package without restriction, provided that
-    you duplicate all of the original copyright notices and associated
-    disclaimers.
- .
- 2. You may apply bug fixes, portability fixes and other modifications
-    derived from the Public Domain or from the Copyright Holder.  A
-    Package modified in such a way shall still be considered the Standard
-    Version.
- .
- 3. You may otherwise modify your copy of this Package in any way,
-    provided that you insert a prominent notice in each changed file stating
-    how and when you changed that file, and provided that you do at least
-    ONE of the following:
- .
-    a) place your modifications in the Public Domain or otherwise make them
-    Freely Available, such as by posting said modifications to Usenet or
-    an equivalent medium, or placing the modifications on a major archive
-    site such as uunet.uu.net, or by allowing the Copyright Holder to include
-    your modifications in the Standard Version of the Package.
- .
-    b) use the modified Package only within your corporation or organization.
- .
-    c) rename any non-standard executables so the names do not conflict
-    with standard executables, which must also be provided, and provide
-    a separate manual page for each non-standard executable that clearly
-    documents how it differs from the Standard Version.
- .
-    d) make other distribution arrangements with the Copyright Holder.
- .
- 4. You may distribute the programs of this Package in object code or
-    executable form, provided that you do at least ONE of the following:
- .
-    a) distribute a Standard Version of the executables and library files,
-    together with instructions (in the manual page or equivalent) on where
-    to get the Standard Version.
- .
-    b) accompany the distribution with the machine-readable source of
-    the Package with your modifications.
- .
-    c) give non-standard executables non-standard names, and clearly
-    document the differences in manual pages (or equivalent), together
-    with instructions on where to get the Standard Version.
- .
-    d) make other distribution arrangements with the Copyright Holder.
- .
- 5. You may charge a reasonable copying fee for any distribution of this
-    Package.  You may charge any fee you choose for support of this Package.
-    You may not charge a fee for this Package itself.  However, you may
-    distribute this Package in aggregate with other (possibly commercial)
-    programs as part of a larger (possibly commercial) software distribution
-    provided that you do not advertise this Package as a product of your
-    own.  You may embed this Package's interpreter within an executable of
-    yours (by linking); this shall be construed as a mere form of
-    aggregation, provided that the complete Standard Version of the
-    interpreter is so embedded.
- .
- 6. The scripts and library files supplied as input to or produced as
-    output from the programs of this Package do not automatically fall under
-    the copyright of this Package, but belong to whoever generated them, and
-    may be sold commercially, and may be aggregated with this Package.  If
-    such scripts or library files are aggregated with this Package via the
-    so-called "undump" or "unexec" methods of producing a binary executable
-    image, then distribution of such an image shall neither be construed as
-    a distribution of this Package nor shall it fall under the restrictions
-    of Paragraphs 3 and 4, provided that you do not represent such an
-    executable image as a Standard Version of this Package.
- .
- 7. C subroutines (or comparably compiled subroutines in other
-    languages) supplied by you and linked into this Package in order to
-    emulate subroutines and variables of the language defined by this
-    Package shall not be considered part of this Package, but are the
-    equivalent of input as in Paragraph 6, provided these subroutines do
-    not change the language in any way that would cause it to fail the
-    regression tests for the language.
- .
- 8. Aggregation of this Package with a commercial distribution is always
-    permitted provided that the use of this Package is embedded; that is,
-    when no overt attempt is made to make this Package's interfaces visible
-    to the end user of the commercial distribution.  Such use shall not be
-    construed as a distribution of this Package.
- .
- 9. The name of the Copyright Holder may not be used to endorse or promote
-    products derived from this software without specific prior written permission.
- .
- 10. THIS PACKAGE IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR
-    IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED
-    WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE.
diff --git a/debian/rules b/debian/rules
deleted file mode 100755
index 68d9a36..0000000
--- a/debian/rules
+++ /dev/null
@@ -1,4 +0,0 @@
-#!/usr/bin/make -f
-
-%:
-	dh $@ --buildsystem R
diff --git a/debian/source/format b/debian/source/format
deleted file mode 100644
index 163aaf8..0000000
--- a/debian/source/format
+++ /dev/null
@@ -1 +0,0 @@
-3.0 (quilt)
diff --git a/debian/watch b/debian/watch
deleted file mode 100644
index 9248da7..0000000
--- a/debian/watch
+++ /dev/null
@@ -1,3 +0,0 @@
-version=4
-opts=downloadurlmangle=s?^(.*)\.\.?https:$1packages/release/data/annotation? \
-http://www.bioconductor.org/packages/release/data/annotation/html/GenomeInfoDbData.html .*/GenomeInfoDbData_(.*).tar.gz
diff --git a/inst/scripts/updateGenomeInfoDbData.R b/inst/scripts/updateGenomeInfoDbData.R
new file mode 100644
index 0000000..8cfcca8
--- /dev/null
+++ b/inst/scripts/updateGenomeInfoDbData.R
@@ -0,0 +1,65 @@
+## Scripts for updating specData, speciesMap and validTaxIds
+
+## Download and unpack mapping file: 
+## ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz
+
+## Generates specData (columns tax_id, genus, species) from names.dmp
+.processTaxNamesFile <- function(filesDir=getwd()){
+    ## filesDir: directory holding the unpacked 'names.dmp' (default: getwd())
+    dest  <- file.path(filesDir, "names.dmp")
+    data <-  read.delim(dest, header=FALSE, sep="\t", quote="",
+                        stringsAsFactors=FALSE)
+    species <- data[,seq(1, dim(data)[2], by=2)] ## keep odd columns only, i.e. throw away the 'pipe columns'
+    colnames(species) <- c('tax_id','name_txt','unique_name','name_class')
+    ## keep tax_id, name_txt and name_class (drops unique_name)
+    species <- species[,c(1:2,4)]
+    ## strip any tab characters from name_txt
+    species[[2]] <- gsub('\t','',species[[2]])
+    ## ... and from name_class
+    species[[3]] <- gsub('\t','',species[[3]])
+    ## keep only rows whose name_class contains 'scientific name'
+    keep <- grepl('scientific name', species[[3]])
+    species <- species[keep,1:2]
+ 
+    ## split each name on spaces: first token is the genus, the rest the species
+    rawSpec <- species[[2]]
+    spltSpec <- strsplit(rawSpec, split=" ")
+    genusDat <- unlist(lapply(spltSpec, function(x){x[1]}))
+    .getRest <- function(x){
+        if(length(x) > 1){
+            return(paste(x[2:length(x)], collapse=" "))
+        }else{
+            return(NA) ## single-token name: no species part
+        }
+    }
+    speciesDat <- unlist(lapply(spltSpec, .getRest))
+    specData <- data.frame(tax_id=as.integer(species[[1]]), ## integer
+                           genus=as.factor(genusDat),       ## factor
+                           species=speciesDat,              ## character
+                           stringsAsFactors=FALSE)
+    save(specData, file='specData.rda', compress="xz") ## relative path: written to getwd(), not filesDir
+}
+
+## Generates speciesMap (columns taxon, species) and validTaxIds from names.dmp
+.processSpeciesMapData <- function(){
+    con <- file('names.dmp') ## relative path: read from the working directory
+    species <- readLines(con)
+    close(con)
+    splt <- strsplit(species, split='\\t\\|\\t') ## fields are separated by <tab>|<tab>
+    ## Keep elements whose 4th field contains 'scientific name' or 'synonym'
+    idx1 <- unlist(lapply(splt, function(x){grepl('scientific name', x[4])}))
+    idx2 <- unlist(lapply(splt, function(x){grepl('synonym', x[4])}))
+    idx <- idx1 | idx2
+    splt <- splt[idx]
+    ## and keep only the 1st two fields (they become taxon and species)
+    taxon <-  as.integer(unlist(lapply(splt, function(x){x[1]})))
+    species <- unlist(lapply(splt, function(x){x[2]})) 
+    speciesMap <- data.frame(taxon,    ## integer
+                             species,  ## character 
+                             stringsAsFactors=FALSE)
+    save(speciesMap, file='speciesMap.rda', compress="xz")
+
+    ## Then get the valid Tax IDs: the distinct taxon values of speciesMap.
+    validTaxIds <- unique(speciesMap$taxon)  ## integer
+    save(validTaxIds, file='validTaxIds.rda', compress="xz")
+}
diff --git a/man/GenomeInfoDbData-package.Rd b/man/GenomeInfoDbData-package.Rd
new file mode 100644
index 0000000..c80a8a5
--- /dev/null
+++ b/man/GenomeInfoDbData-package.Rd
@@ -0,0 +1,43 @@
+\name{GenomeInfoDbData-package}
+
+\alias{GenomeInfoDbData-package}
+\alias{speciesMap}
+\alias{validTaxIds}
+\alias{specData}
+
+\title{Species and taxonomy ID look up tables}
+
+\description{
+  This package contains three mapping objects:
+  \itemize{ 
+    \item speciesMap: A data frame with columns \sQuote{taxon} and
+          \sQuote{species}. Used to retrieve taxonomy ID by species and
+          to return the list of available species.
+    \item validTaxIds: An integer vector of valid taxonomy IDs created from
+          \code{speciesMap}. Used internally for quick taxonomy ID look ups.
+    \item specData: A data frame with columns \sQuote{tax_id}, \sQuote{genus},
+          and \sQuote{species}. Used internally to retrieve species by taxonomy ID.
+  }
+}
+
+\details{
+  Scripts to generate these files are in GenomeInfoDbData/inst/scripts. All
+  originate from the taxdump download at
+  \url{ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz}.
+}
+
+\usage{
+data(speciesMap)
+data(validTaxIds)
+data(specData)
+}
+
+\examples{
+data(speciesMap)
+sapply(speciesMap, class)   #       taxon     species 
+                            #    "integer" "character" 
+subset(speciesMap, species=="Homo sapiens")$taxon # [1] 9606
+}
+
+\keyword{datasets}
+\author{Bioconductor Core Team}

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/r-bioc-genomeinfodbdata.git