[med-svn] [r-bioc-xvector] 04/07: New upstream version 0.18.0
Andreas Tille
tille at debian.org
Wed Nov 8 10:01:55 UTC 2017
This is an automated email from the git hooks/post-receive script.
tille pushed a commit to branch master
in repository r-bioc-xvector.
commit c6f4066f6b0957f01fa74ad7c9aac5b9a4b1dccc
Author: Andreas Tille <tille at debian.org>
Date: Wed Nov 8 10:56:29 2017 +0100
New upstream version 0.18.0
---
DESCRIPTION | 14 +-
R/RDS-random-access.R | 47 +++
inst/include/XVector_interface.h | 6 +
inst/include/_XVector_stubs.c | 5 +
src/RDS_random_access.c | 829 +++++++++++++++++++++++++++++++++++++++
src/R_init_XVector.c | 6 +
src/XVector.h | 26 ++
src/io_utils.c | 43 ++
8 files changed, 969 insertions(+), 7 deletions(-)
diff --git a/DESCRIPTION b/DESCRIPTION
index 7187892..9a28376 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,25 +1,25 @@
Package: XVector
-Title: Representation and manpulation of external sequences
+Title: Representation and manipulation of external sequences
Description: Memory efficient S4 classes for storing sequences "externally"
(behind an R external pointer, or on disk).
-Version: 0.16.0
+Version: 0.18.0
Encoding: UTF-8
Author: Hervé Pagès and Patrick Aboyoun
Maintainer: Hervé Pagès <hpages at fredhutch.org>
biocViews: Infrastructure, DataRepresentation
Depends: R (>= 2.8.0), methods, BiocGenerics (>= 0.19.2), S4Vectors (>=
- 0.13.13), IRanges (>= 2.9.18)
+ 0.15.14), IRanges (>= 2.9.18)
Imports: methods, zlibbioc, BiocGenerics, S4Vectors, IRanges
LinkingTo: S4Vectors, IRanges
Suggests: Biostrings, drosophila2probe, RUnit
License: Artistic-2.0
-Collate: io-utils.R SharedVector-class.R SharedRaw-class.R
- SharedInteger-class.R SharedDouble-class.R XVector-class.R
- XRaw-class.R XInteger-class.R XDouble-class.R
+Collate: io-utils.R RDS-random-access.R SharedVector-class.R
+ SharedRaw-class.R SharedInteger-class.R SharedDouble-class.R
+ XVector-class.R XRaw-class.R XInteger-class.R XDouble-class.R
XVectorList-class.R XRawList-class.R XRawList-comparison.R
XIntegerViews-class.R XDoubleViews-class.R OnDiskRaw-class.R
RdaCollection-class.R intra-range-methods.R compact-methods.R
reverse-methods.R slice-methods.R view-summarization-methods.R
updateObject-methods.R zzz.R
NeedsCompilation: yes
-Packaged: 2017-04-24 23:30:38 UTC; biocbuild
+Packaged: 2017-10-30 23:40:38 UTC; biocbuild
diff --git a/R/RDS-random-access.R b/R/RDS-random-access.R
new file mode 100644
index 0000000..9bbc79a
--- /dev/null
+++ b/R/RDS-random-access.R
@@ -0,0 +1,47 @@
+### =========================================================================
+### Random access to the elements of a serialized atomic vector or array
+### -------------------------------------------------------------------------
+
+
+### Should probably move this to R/io-utils.R
+.open_input_file <- function(file)
+{
+ filexp_list <- open_input_files(file)
+ stopifnot(length(filexp_list) == 1L)
+ filexp_list[[1L]]
+}
+
+.read_RDS_file <- function(file, mode, attribs_dump=NULL)
+{
+ filexp <- .open_input_file(file)
+ .Call("RDS_read_file", filexp, mode, attribs_dump, PACKAGE="XVector")
+}
+
+read_RDS <- function(file, attribs.only=FALSE)
+{
+ mode <- if (attribs.only) 3L else 0L
+ attribs_dump <- new.env(parent=emptyenv())
+ ans <- .read_RDS_file(file, mode, attribs_dump=attribs_dump)
+ if (attribs.only)
+ ans <- attribs_dump
+ ans
+}
+
+read_RDS_typeof_and_length <- function(file) .read_RDS_file(file, 4L)
+
+extract_subvector_from_RDS_vector <- function(file, pos)
+{
+ filexp <- .open_input_file(file)
+ .Call("RDS_extract_subvector", filexp, pos, PACKAGE="XVector")
+}
+
+extract_subarray_from_RDS_array <- function(file, index)
+{
+ attribs_dump <- read_RDS(file, attribs.only=TRUE)
+ x_dim <- try(get("dim", envir=attribs_dump, inherits=FALSE), silent=TRUE)
+ if (inherits(x_dim , "try-error"))
+ stop("serialized object is not an array")
+ filexp <- .open_input_file(file)
+ .Call("RDS_extract_subarray", filexp, x_dim, index, PACKAGE="XVector")
+}
+
diff --git a/inst/include/XVector_interface.h b/inst/include/XVector_interface.h
index 548d317..bb536a9 100644
--- a/inst/include/XVector_interface.h
+++ b/inst/include/XVector_interface.h
@@ -16,6 +16,12 @@
* io_utils.c
*/
+int filexp_read(
+ SEXP filexp,
+ char *buf,
+ int buf_size
+);
+
int filexp_gets(
SEXP filexp,
char *buf,
diff --git a/inst/include/_XVector_stubs.c b/inst/include/_XVector_stubs.c
index 152d6ac..51cc45f 100644
--- a/inst/include/_XVector_stubs.c
+++ b/inst/include/_XVector_stubs.c
@@ -31,6 +31,11 @@ void stubname Targs \
* Stubs for callables defined in io_utils.c
*/
+DEFINE_CCALLABLE_STUB(int, filexp_read,
+ (SEXP filexp, char *buf, int buf_size),
+ ( filexp, buf, buf_size)
+)
+
DEFINE_CCALLABLE_STUB(int, filexp_gets,
(SEXP filexp, char *buf, int buf_size, int *EOL_in_buf),
( filexp, buf, buf_size, EOL_in_buf)
diff --git a/src/RDS_random_access.c b/src/RDS_random_access.c
new file mode 100644
index 0000000..982ccc5
--- /dev/null
+++ b/src/RDS_random_access.c
@@ -0,0 +1,829 @@
+/****************************************************************************
+ ****************************************************************************
+ * Random access to the elements of a serialized atomic vector or array *
+ * Author: H. Pag\`es *
+ ****************************************************************************
+ ****************************************************************************/
+#include "XVector.h"
+#include "IRanges_interface.h"
+#include "S4Vectors_interface.h"
+
+#include <limits.h> // for INT_MAX
+
+static int verbose = 0;
+
+
+#define IS_ATOMIC_TYPE(type) \
+ ((type) == LGLSXP || (type) == INTSXP || (type) == REALSXP || \
+ (type) == CPLXSXP || (type) == RAWSXP || (type) == STRSXP)
+
+/* Equivalent to the DATAPTR() macro defined in Rinternals.h
+ For some reason I don't understand, I can't use the DATAPTR() macro. */
+static void *dataptr(SEXP x)
+{
+ switch (TYPEOF(x)) {
+ case LGLSXP:
+ return LOGICAL(x);
+ case INTSXP:
+ return INTEGER(x);
+ case REALSXP:
+ return REAL(x);
+ case CPLXSXP:
+ return COMPLEX(x);
+ case RAWSXP:
+ return RAW(x);
+ }
+ error("XVector internal error in dataptr(): "
+ "%s type not supported", CHAR(type2str(TYPEOF(x))));
+}
+
+static size_t type2atomsize(SEXPTYPE type)
+{
+ switch (type) {
+ case LGLSXP:
+ case INTSXP:
+ return sizeof(int);
+ case REALSXP:
+ return sizeof(double);
+ case CPLXSXP:
+ return sizeof(Rcomplex);
+ case RAWSXP:
+ return sizeof(Rbyte);
+ }
+ error("XVector internal error in type2atomsize(): "
+ "undefined atom size for type %s", CHAR(type2str(type)));
+}
+
+
+/****************************************************************************
+ * A simple RDS parser
+ *
+ * The current implementation assumes that:
+ * - sizeof(int) = 4
+ * - sizeof(double) = sizeof(long long int) = 8
+ * - platform is little endian
+ */
+
+static void printf_margin(int indent)
+{
+ int i;
+
+ for (i = 0; i < indent; i++)
+ printf(" ");
+ return;
+}
+
+#define PRINTIFVERBOSE1(msg) \
+{ \
+ if (verbose) { \
+ printf_margin(indent); \
+ printf(msg); \
+ printf("\n"); \
+ } \
+}
+#define PRINTIFVERBOSE2(format, value) \
+{ \
+ if (verbose) { \
+ printf_margin(indent); \
+ printf(format, value); \
+ printf("\n"); \
+ } \
+}
+
+static SEXPTYPE RDStype2Rtype(unsigned char type)
+{
+ /* NULL type is 0xfe in RDS, not NILSXP */
+ return type == 0xfe ? NILSXP : type;
+}
+
+static const char *RDS_read_bytes(SEXP filexp, size_t n, int parse_only,
+ unsigned char *buf)
+{
+ int n2, n3;
+ static char errmsg_buf[40];
+
+ /* Because the 'buf_size' argument in _filexp_read() must be an
+ int, we cannot read more than INT_MAX bytes per call to
+ _filexp_read(). */
+ while (n > 0) {
+ n2 = n <= INT_MAX ? n : INT_MAX;
+ if (parse_only) {
+ _filexp_seek(filexp, n2, SEEK_CUR);
+ } else {
+ n3 = _filexp_read(filexp, (char *) buf, n2);
+ if (n3 != n2) {
+ snprintf(errmsg_buf, sizeof(errmsg_buf),
+ "read error or unexpected end of file");
+ return errmsg_buf;
+ }
+ buf += n2;
+ }
+ n -= n2;
+ }
+ return NULL;
+}
+
+static void RDS_read_chars(SEXP filexp, size_t n, int parse_only,
+ CharAE *string_buf)
+{
+ const char *errmsg;
+
+ if (!parse_only && n > string_buf->_buflength)
+ CharAE_extend(string_buf, n);
+ errmsg = RDS_read_bytes(filexp, n, parse_only,
+ (unsigned char *) string_buf->elts);
+ if (errmsg != NULL)
+ error(errmsg);
+ if (!parse_only)
+ CharAE_set_nelt(string_buf, n);
+ return;
+}
+
+static void swap_4_bytes(unsigned char *bytes)
+{
+ unsigned int *tmp;
+
+ tmp = (unsigned int *) bytes;
+ *tmp = (*tmp << 24) |
+ ((*tmp & 0xff00) << 8) |
+ ((*tmp & 0xff0000) >> 8) |
+ (*tmp >> 24);
+ return;
+}
+
+static const char *RDS_read_ints(SEXP filexp, size_t n, int parse_only,
+ int *buf)
+{
+ const char *errmsg;
+ size_t i;
+
+ /* Integer values are *always* 4 bytes in an RDS file, even if
+ sizeof(int) != 4 on the machine running this code! */
+ errmsg = RDS_read_bytes(filexp, n * 4, parse_only,
+ (unsigned char *) buf);
+ if (errmsg != NULL)
+ return errmsg;
+ /* FIXME: Don't swap bytes if platform is big endian */
+ if (!parse_only)
+ for (i = 0; i < n; i++)
+ swap_4_bytes((unsigned char *) (buf + i));
+ return NULL;
+}
+
+static void swap_8_bytes(unsigned char *bytes)
+{
+ unsigned long long int *tmp;
+
+ tmp = (unsigned long long int *) bytes;
+ *tmp = (*tmp << 56) |
+ ((*tmp & 0xff00) << 40) |
+ ((*tmp & 0xff0000) << 24) |
+ ((*tmp & 0xff000000) << 8) |
+ ((*tmp & 0xff00000000) >> 8) |
+ ((*tmp & 0xff0000000000) >> 24) |
+ ((*tmp & 0xff000000000000) >> 40) |
+ (*tmp >> 56);
+ return;
+}
+
+static const char *RDS_read_doubles(SEXP filexp, size_t n, int parse_only,
+ double *buf)
+{
+ const char *errmsg;
+ size_t i;
+
+ /* Double values are *always* 8 bytes in an RDS file, even if
+ sizeof(double) != 8 on the machine running this code! */
+ errmsg = RDS_read_bytes(filexp, n * 8, parse_only,
+ (unsigned char *) buf);
+ if (errmsg != NULL)
+ return errmsg;
+ /* FIXME: Don't swap bytes if platform is big endian */
+ if (!parse_only)
+ for (i = 0; i < n; i++)
+ swap_8_bytes((unsigned char *) (buf + i));
+ return NULL;
+}
+
+static R_xlen_t RDS_read_vector_length(SEXP filexp)
+{
+ const char *errmsg;
+ const unsigned char LONG_LENGTH_bytes[4] = {0xff, 0xff, 0xff, 0xff};
+ unsigned char buf[8];
+ int *length;
+ long long int *long_length;
+
+ errmsg = RDS_read_bytes(filexp, 4, 0, buf);
+ if (errmsg != NULL)
+ error(errmsg);
+ if (memcmp(buf, LONG_LENGTH_bytes, 4) != 0) {
+ swap_4_bytes(buf);
+ length = (int *) buf;
+ return (R_xlen_t) *length;
+ }
+ errmsg = RDS_read_bytes(filexp, 8, 0, buf);
+ if (errmsg != NULL)
+ error(errmsg);
+ swap_8_bytes(buf);
+ long_length = (long long int *) buf;
+ return (R_xlen_t) *long_length;
+}
+
+SEXP get_typeof_and_length_as_list(SEXP filexp, SEXPTYPE type)
+{
+ R_xlen_t length;
+ SEXP ans, ans_elt, ans_names, ans_names_elt;
+
+ length = type == NILSXP ? 0 : RDS_read_vector_length(filexp);
+
+ ans = PROTECT(NEW_LIST(2));
+
+ /* Set "typeof" element. */
+ ans_elt = PROTECT(ScalarString(type2str(type)));
+ SET_VECTOR_ELT(ans, 0, ans_elt);
+ UNPROTECT(1);
+ /* Set "length" element. */
+ if (length <= INT_MAX)
+ ans_elt = PROTECT(ScalarInteger((int) length));
+ else
+ ans_elt = PROTECT(ScalarReal((double) length));
+ SET_VECTOR_ELT(ans, 1, ans_elt);
+ UNPROTECT(1);
+
+ ans_names = PROTECT(NEW_CHARACTER(2));
+ ans_names_elt = PROTECT(mkChar("typeof"));
+ SET_STRING_ELT(ans_names, 0, ans_names_elt);
+ UNPROTECT(1);
+ ans_names_elt = PROTECT(mkChar("length"));
+ SET_STRING_ELT(ans_names, 1, ans_names_elt);
+ UNPROTECT(1);
+ SET_NAMES(ans, ans_names);
+ UNPROTECT(1);
+
+ UNPROTECT(1);
+ return ans;
+}
+
+/* Encoded strings not supported. */
+static int RDS_read_string(SEXP filexp, int parse_only, CharAE *string_buf)
+{
+ const char *errmsg;
+ const unsigned char NA_STRING_bytes[4] = {0xff, 0xff, 0xff, 0xff};
+ unsigned char buf[4];
+ R_xlen_t ans_len;
+
+ errmsg = RDS_read_bytes(filexp, sizeof(buf), 0, buf);
+ if (errmsg != NULL)
+ error(errmsg);
+ if (buf[0] != 0 || buf[2] != 0 || buf[3] != 0x09)
+ error("unsupported RDS file");
+ if (buf[1] == 0) {
+ errmsg = RDS_read_bytes(filexp, sizeof(buf), 0, buf);
+ if (errmsg != NULL)
+ error(errmsg);
+ if (memcmp(buf, NA_STRING_bytes, sizeof(buf)) != 0)
+ error("unsupported RDS file");
+ return 1;
+ }
+ if (buf[1] != 0x04)
+ error("unsupported string header");
+ ans_len = RDS_read_vector_length(filexp);
+ RDS_read_chars(filexp, (size_t) ans_len, parse_only, string_buf);
+ return 0;
+}
+
+/* Return R_NilValue if parse_only != 0. */
+static SEXP RDS_read_character_vector(SEXP filexp, int parse_only,
+ CharAE *string_buf, int indent)
+{
+ R_xlen_t ans_len, i;
+ int is_na;
+ SEXP ans, ans_elt;
+
+ PRINTIFVERBOSE1("start reading character vector");
+ ans_len = RDS_read_vector_length(filexp);
+ PRINTIFVERBOSE2("object length: %td", ans_len);
+ ans = parse_only ? R_NilValue : PROTECT(NEW_CHARACTER(ans_len));
+ for (i = 0; i < ans_len; i++) {
+ is_na = RDS_read_string(filexp, parse_only, string_buf);
+ if (parse_only)
+ continue;
+ if (is_na) {
+ SET_STRING_ELT(ans, i, NA_STRING);
+ } else {
+ PROTECT(ans_elt = new_CHARSXP_from_CharAE(string_buf));
+ SET_STRING_ELT(ans, i, ans_elt);
+ UNPROTECT(1);
+ }
+ }
+ if (!parse_only)
+ UNPROTECT(1);
+ PRINTIFVERBOSE1("done reading character vector");
+ return ans;
+}
+
+/* Return R_NilValue if parse_only != 0. */
+static SEXP RDS_read_atomic_vector(SEXP filexp, SEXPTYPE type,
+ int parse_only, int indent)
+{
+ R_xlen_t ans_len;
+ SEXP ans;
+ const char *errmsg;
+
+ PRINTIFVERBOSE2("start reading %s vector", CHAR(type2str(type)));
+ ans_len = RDS_read_vector_length(filexp);
+ PRINTIFVERBOSE2("object length: %td", ans_len);
+ ans = parse_only ? R_NilValue : PROTECT(allocVector(type, ans_len));
+ switch (type) {
+ case LGLSXP:
+ case INTSXP:
+ errmsg = RDS_read_ints(filexp, (size_t) ans_len,
+ parse_only, parse_only ? NULL : dataptr(ans));
+ break;
+ case REALSXP:
+ errmsg = RDS_read_doubles(filexp, (size_t) ans_len,
+ parse_only, parse_only ? NULL : dataptr(ans));
+ break;
+ case CPLXSXP:
+ errmsg = RDS_read_doubles(filexp, (size_t) ans_len * 2,
+ parse_only, parse_only ? NULL : dataptr(ans));
+ break;
+ case RAWSXP:
+ errmsg = RDS_read_bytes(filexp, (size_t) ans_len,
+ parse_only, parse_only ? NULL : dataptr(ans));
+ break;
+ default:
+ error("XVector internal error in RDS_read_atomic_vector(): "
+ "unexpected type: %s", CHAR(type2str(type)));
+ }
+ if (errmsg != NULL)
+ error(errmsg);
+ if (!parse_only)
+ UNPROTECT(1);
+ PRINTIFVERBOSE2("done reading %s vector", CHAR(type2str(type)));
+ return ans;
+}
+
+static SEXP RDS_read_object(SEXP filexp, int mode, SEXP attribs_dump,
+ CharAE *string_buf, CharAEAE *symbols_buf, int indent);
+
+/* Return R_NilValue if parse_only != 0. */
+static SEXP RDS_read_list(SEXP filexp, int parse_only,
+ CharAE *string_buf, CharAEAE *symbols_buf, int indent)
+{
+ R_xlen_t ans_len, i;
+ SEXP ans, ans_elt;
+
+ PRINTIFVERBOSE1("start reading list object");
+ ans_len = RDS_read_vector_length(filexp);
+ PRINTIFVERBOSE2("object length: %td", ans_len);
+ ans = parse_only ? R_NilValue : PROTECT(NEW_LIST(ans_len));
+ for (i = 0; i < ans_len; i++) {
+ ans_elt = RDS_read_object(filexp, parse_only, R_NilValue,
+ string_buf, symbols_buf,
+ indent + 1);
+ if (parse_only)
+ continue;
+ PROTECT(ans_elt);
+ SET_VECTOR_ELT(ans, i, ans_elt);
+ UNPROTECT(1);
+ }
+ if (!parse_only)
+ UNPROTECT(1);
+ PRINTIFVERBOSE1("done reading list object");
+ return ans;
+}
+
+static int RDS_read_attrib_separator(SEXP filexp)
+{
+ const char *errmsg;
+ const unsigned char EOA_bytes[4] = {0x00, 0x00, 0x00, 0xfe},
+ ATTRIB_SEP_bytes[4] = {0x00, 0x00, 0x04, 0x02};
+ unsigned char buf[4];
+
+ errmsg = RDS_read_bytes(filexp, sizeof(buf), 0, buf);
+ if (errmsg != NULL)
+ error(errmsg);
+ if (memcmp(buf, EOA_bytes, sizeof(buf)) == 0)
+ return 0;
+ if (memcmp(buf, ATTRIB_SEP_bytes, sizeof(buf)) != 0)
+ error("unrecognized attribute header");
+ return 1;
+}
+
+/* Store symbol (as 0-terminated string) in one of 'symbols_buf' elements.
+ Return the "key" of this element i.e. its 0-based index in 'symbols_buf'. */
+static unsigned int RDS_read_symbol(SEXP filexp, CharAEAE *symbols_buf,
+ int indent)
+{
+ const char *errmsg;
+ const unsigned char NEW_SYMBOL_bytes[4] = {0x00, 0x00, 0x00, 0x01};
+ unsigned char buf[4];
+ unsigned int key;
+ CharAE *namebuf;
+
+ PRINTIFVERBOSE1("start reading symbol");
+ errmsg = RDS_read_bytes(filexp, sizeof(buf), 0, buf);
+ if (errmsg != NULL)
+ error(errmsg);
+ if (memcmp(buf, NEW_SYMBOL_bytes, sizeof(buf)) == 0) {
+ /* New symbol. */
+ namebuf = new_CharAE(0);
+ if (RDS_read_string(filexp, 0, namebuf))
+ error("invalid symbol (NA)");
+ CharAE_insert_at(namebuf, CharAE_get_nelt(namebuf), '\0');
+ key = CharAEAE_get_nelt(symbols_buf);
+ CharAEAE_insert_at(symbols_buf, key, namebuf);
+ } else {
+ /* Known symbol (i.e. already in 'symbols_buf'). */
+ key = (((unsigned int) buf[0]) << 16) |
+ (((unsigned int) buf[1]) << 8) |
+ ((unsigned int) buf[2]);
+ if (buf[3] != 0xff || key == 0)
+ error("unsupported symbol specifier");
+ key--;
+ }
+ PRINTIFVERBOSE2("done reading symbol [%s]",
+ symbols_buf->elts[key]->elts);
+ return key;
+}
+
+/* Always parse the full attributes.
+ In mode 0: Load and set the attributes on 'object', return R_NilValue.
+ In mode 1: (parse-only mode) Don't load anything, don't set anything on
+ 'object' (which should be R_NilValue), and return R_NilValue;
+ In mode 2: Load the attributes and do NOT set them on 'object' ('object'
+ is ignored), but dump them in the 'attribs_dump' environment. */
+static void RDS_read_attribs(SEXP filexp, int mode,
+ SEXP object, SEXP attribs_dump,
+ CharAE *string_buf, CharAEAE *symbols_buf, int indent)
+{
+ unsigned int key;
+ SEXP attrval;
+ const char *symbol;
+
+ PRINTIFVERBOSE1("start reading object attributes");
+ while (RDS_read_attrib_separator(filexp)) {
+ key = RDS_read_symbol(filexp, symbols_buf, indent + 1);
+ attrval = RDS_read_object(filexp, mode == 1, R_NilValue,
+ string_buf, symbols_buf,
+ indent + 1);
+ if (mode == 1)
+ continue;
+ PROTECT(attrval);
+ symbol = symbols_buf->elts[key]->elts;
+ if (mode == 0)
+ setAttrib(object, install(symbol), attrval);
+ else // mode 2
+ defineVar(install(symbol), attrval, attribs_dump);
+ UNPROTECT(1);
+ }
+ PRINTIFVERBOSE1("done reading object attributes");
+ return;
+}
+
+static SEXP RDS_read_object(SEXP filexp, int mode, SEXP attribs_dump,
+ CharAE *string_buf, CharAEAE *symbols_buf, int indent)
+{
+ const char *errmsg;
+ unsigned char obj_header[4];
+ int has_attribs;
+ SEXPTYPE type;
+ SEXP ans;
+
+ PRINTIFVERBOSE1("start reading object header");
+ errmsg = RDS_read_bytes(filexp, sizeof(obj_header), 0, obj_header);
+ if (errmsg != NULL)
+ error(errmsg);
+ if (obj_header[0] != 0 || obj_header[1] != 0)
+ error("unsupported RDS file");
+ PRINTIFVERBOSE1("done reading object header");
+ if (obj_header[2] == 0) {
+ /* Object has no attributes. */
+ if (mode == 3)
+ return R_NilValue; // early bail out
+ has_attribs = 0;
+ } else if (obj_header[2] == 0x02 || obj_header[2] == 0x03) {
+ /* Object has attributes (code 0x03 seems to be specific
+ to factors). */
+ if (mode == 3)
+ mode = 2;
+ has_attribs = 1;
+ } else {
+ error("unexpected 3rd byte in object header");
+ }
+ type = RDStype2Rtype(obj_header[3]);
+ PRINTIFVERBOSE2("object type: %s", CHAR(type2str(type)));
+ if (mode == 4)
+ return get_typeof_and_length_as_list(filexp, type);
+ if (type == NILSXP) {
+ ans = R_NilValue;
+ } else if (type == STRSXP) {
+ ans = RDS_read_character_vector(filexp, mode != 0,
+ string_buf, indent);
+ } else if (IS_ATOMIC_TYPE(type)) {
+ ans = RDS_read_atomic_vector(filexp, type, mode != 0,
+ indent);
+ } else if (type == VECSXP) {
+ ans = RDS_read_list(filexp, mode != 0,
+ string_buf, symbols_buf, indent);
+ } else {
+ error("RDS parser does not support type: %s",
+ CHAR(type2str(type)));
+ }
+ if (has_attribs) {
+ if (!isNull(ans))
+ PROTECT(ans);
+ RDS_read_attribs(filexp, mode, ans, attribs_dump,
+ string_buf, symbols_buf, indent);
+ if (!isNull(ans))
+ UNPROTECT(1);
+ }
+ return ans;
+}
+
+static void RDS_read_file_header(SEXP filexp)
+{
+ const char *errmsg;
+ const unsigned char RDS_header[14] = {0x58, 0x0a,
+ 0x00, 0x00, 0x00, 0x02,
+ 0x00, 0x03, 0x04, 0x02,
+ 0x00, 0x02, 0x03, 0x00};
+ unsigned char file_header[sizeof(RDS_header)];
+ int indent;
+
+ indent = 0;
+ PRINTIFVERBOSE1("start reading file header");
+ errmsg = RDS_read_bytes(filexp, sizeof(file_header), 0, file_header);
+ if (errmsg != NULL)
+ error(errmsg);
+ if (memcmp(file_header, RDS_header, sizeof(file_header)) != 0)
+ error("does not look like an RDS file");
+ PRINTIFVERBOSE1("done reading file header");
+ return;
+}
+
+
+/****************************************************************************
+ * RDS_read_file()
+ *
+ * --- .Call ENTRY POINT ---
+ * Read/parse an RDS file. Only support a serialized atomic vector or a NULL
+ * or a list (possibly nested) made of the formers. Support attributes (if
+ * made of the formers).
+ * Args:
+ * filexp: External pointer to a FILE pointer.
+ * mode: Control what parts of the object to load. In modes 0, 1, 2 the
+ * full object gets parsed:
+ * mode 0: Load everything and return the full object.
+ * mode 1: (parse-only mode) Don't load anything and return
+ * R_NilValue.
+ * mode 2: Load only the attributes and dump them in the
+ * 'attribs_dump' environment.
+ * Mode 3 is like mode 2 but with early bailout if the object header
+ * indicates that the object has no attributes. So in this mode the
+ * object gets fully parsed only if it has attributes. Otherwise
+ * only its header gets parsed.
+ * In mode 4 only the object header and length get parsed.
+ * attribs_dump: Environment used in modes 2 and 3 to dump the attributes.
+ */
+SEXP RDS_read_file(SEXP filexp, SEXP mode, SEXP attribs_dump)
+{
+ int mode0;
+ CharAE *string_buf;
+ CharAEAE *symbols_buf;
+
+ RDS_read_file_header(filexp);
+ mode0 = INTEGER(mode)[0];
+ string_buf = new_CharAE(0);
+ symbols_buf = new_CharAEAE(0, 0);
+ return RDS_read_object(filexp, mode0, attribs_dump,
+ string_buf, symbols_buf, 1);
+}
+
+
+/****************************************************************************
+ * RDS_extract_subvector()
+ */
+
+static SEXPTYPE extract_top_level_object_type(SEXP filexp)
+{
+ const char *errmsg;
+ unsigned char obj_header[4];
+ SEXPTYPE x_type;
+
+ RDS_read_file_header(filexp);
+ errmsg = RDS_read_bytes(filexp, sizeof(obj_header), 0, obj_header);
+ if (errmsg != NULL)
+ error(errmsg);
+ x_type = RDStype2Rtype(obj_header[3]);
+ if (!IS_ATOMIC_TYPE(x_type) || x_type == STRSXP)
+ error("extracting elements from a serialized object of "
+ "type %s is not supported", CHAR(type2str(x_type)));
+ return x_type;
+}
+
+static const char *get_pos(int pos_type, const void *pos,
+ R_xlen_t i, long long int *pos_elt)
+{
+ int tmp0, is_na;
+ double tmp1;
+ long long int tmp2;
+ static char errmsg_buf[80];
+
+ switch (pos_type) {
+ case 0: // 'pos' contains int values
+ tmp0 = ((const int *) pos)[i];
+ is_na = tmp0 == NA_INTEGER;
+ *pos_elt = (long long int) tmp0;
+ break;
+ case 1: // 'pos' contains double values
+ tmp1 = ((const double *) pos)[i];
+ is_na = ISNAN(tmp1);
+ *pos_elt = (long long int) tmp1;
+ break;
+ case 2: // 'pos' contains long long int values
+ tmp2 = ((const long long int *) pos)[i];
+ is_na = tmp2 == NA_LLINT;
+ *pos_elt = tmp2;
+ break;
+ default:
+ snprintf(errmsg_buf, sizeof(errmsg_buf),
+ "XVector internal error in get_pos(): "
+ "unsupported 'pos' type");
+ return errmsg_buf;
+ }
+ if (is_na) {
+ snprintf(errmsg_buf, sizeof(errmsg_buf),
+ "'pos' cannot contain NAs");
+ return errmsg_buf;
+ }
+ return NULL;
+}
+
+static void RDS_read_atom_at_offset(SEXP filexp,
+ long long int offset, SEXP ans, R_xlen_t i)
+{
+ size_t n;
+ const char *errmsg;
+
+ if (offset < 0)
+ error("positions of elements to extract must be sorted");
+ n = offset * type2atomsize(TYPEOF(ans));
+ errmsg = RDS_read_bytes(filexp, n, 1, NULL);
+ if (errmsg != NULL)
+ error(errmsg);
+ switch (TYPEOF(ans)) {
+ case LGLSXP:
+ errmsg = RDS_read_ints(filexp, 1, 0, LOGICAL(ans) + i);
+ break;
+ case INTSXP:
+ errmsg = RDS_read_ints(filexp, 1, 0, INTEGER(ans) + i);
+ break;
+ case REALSXP:
+ errmsg = RDS_read_doubles(filexp, 1, 0, REAL(ans) + i);
+ break;
+ case CPLXSXP:
+ errmsg = RDS_read_doubles(filexp, 2, 0,
+ (double *) (COMPLEX(ans) + i));
+ break;
+ case RAWSXP:
+ errmsg = RDS_read_bytes(filexp, 1, 0, RAW(ans) + i);
+ break;
+ default:
+ error("XVector internal error in RDS_read_atom_at_offset(): "
+ "unexpected type: %s", CHAR(type2str(TYPEOF(ans))));
+ }
+ if (errmsg != NULL)
+ error(errmsg);
+ return;
+}
+
+static const char *RDS_read_atoms_at_positions(SEXP filexp,
+ R_xlen_t x_len, int pos_type, const void *pos, SEXP ans)
+{
+ long long int pos_elt, prev_pos_elt, offset;
+ R_xlen_t i;
+ const char *errmsg;
+ static char errmsg_buf[40];
+
+ prev_pos_elt = 0;
+ for (i = 0; i < XLENGTH(ans); i++) {
+ errmsg = get_pos(pos_type, pos, i, &pos_elt);
+ if (errmsg != NULL)
+ return errmsg;
+ if (pos_elt < 1 || pos_elt > x_len) {
+ snprintf(errmsg_buf, sizeof(errmsg_buf),
+ "'pos' contains invalid positions");
+ return errmsg_buf;
+ }
+ offset = pos_elt - prev_pos_elt - 1;
+ RDS_read_atom_at_offset(filexp, offset, ans, i);
+ prev_pos_elt = pos_elt;
+ }
+ return NULL;
+}
+
+/* --- .Call ENTRY POINT ---
+ * Random access to the elements of a serialized atomic vector.
+ * Character vectors not supported.
+ * Args:
+ * filexp: External pointer to a FILE pointer.
+ * pos: An integer, double, or LLint vector containing valid element
+ * positions in the serialized vector. The positions must be 1-based.
+ * So no NAs and all values must be >= 1 and <= vector length.
+ * In addition 'pos' must be sorted.
+ */
+SEXP RDS_extract_subvector(SEXP filexp, SEXP pos)
+{
+ SEXPTYPE x_type;
+ R_xlen_t x_len, pos_len;
+ int pos_type;
+ const void *pos_dataptr;
+ SEXP ans;
+ const char *errmsg;
+
+ /* Get type and length of serialized atomic vector. */
+ x_type = extract_top_level_object_type(filexp);
+ x_len = RDS_read_vector_length(filexp);
+
+ /* Get 'pos' length and pointer to data. */
+ if (IS_INTEGER(pos)) {
+ pos_type = 0;
+ pos_len = XLENGTH(pos);
+ pos_dataptr = INTEGER(pos);
+ } else if (IS_NUMERIC(pos)) {
+ pos_type = 1;
+ pos_len = XLENGTH(pos);
+ pos_dataptr = REAL(pos);
+ } else if (is_LLint(pos)) {
+ pos_type = 2;
+ pos_len = get_LLint_length(pos);
+ pos_dataptr = get_LLint_dataptr(pos);
+ } else {
+ error("'pos' must be an integer, double, or LLint vector");
+ }
+
+ ans = PROTECT(allocVector(x_type, pos_len));
+ errmsg = RDS_read_atoms_at_positions(filexp, x_len,
+ pos_type, pos_dataptr, ans);
+ UNPROTECT(1);
+ if (errmsg != NULL)
+ error(errmsg);
+ return ans;
+}
+
+
+/****************************************************************************
+ * RDS_extract_subarray()
+ */
+
+/* --- .Call ENTRY POINT ---
+ * Random access to the elements of a serialized array.
+ * Character arrays not supported.
+ * Args:
+ * filexp: External pointer to a FILE pointer.
+ * dim: The dimensions of the array. Typically extracted earlier with
+ * RDS_read_file(filexp, 3, attribs_dump).
+ * index: A list of subscripts as positive integer vectors. One vector of
+ * subscripts per array dimension. Each subscript must be sorted.
+ */
+SEXP RDS_extract_subarray(SEXP filexp, SEXP dim, SEXP index)
+{
+ SEXPTYPE x_type;
+ R_xlen_t x_len, dimprod;
+ int ndim, i;
+ SEXP subscript, ans;
+
+ /* Get type and length of serialized array. */
+ x_type = extract_top_level_object_type(filexp);
+ x_len = RDS_read_vector_length(filexp);
+
+ /* Check 'dim'. */
+ if (!IS_INTEGER(dim))
+ error("'dim' must be an integer vector");
+ ndim = LENGTH(dim);
+ dimprod = 1;
+ for (i = 0; i < ndim; i++)
+ dimprod *= INTEGER(dim)[i];
+ if (dimprod > x_len) // this is dangerous
+ error("supplied 'dim' implies that serialized array "
+ "has more elements than it effectively has");
+ if (dimprod < x_len) // this is not
+ warning("supplied 'dim' implies that serialized array "
+ "has less elements than it effectively has");
+
+ /* Check 'index'. */
+ if (!isVectorList(index)) // IS_LIST() is broken
+ error("'index' must be a list");
+ if (LENGTH(index) != ndim)
+ error("'index' must have the same length as 'dim'");
+ for (i = 0; i < ndim; i++) {
+ subscript = VECTOR_ELT(index, i);
+ if (!IS_INTEGER(subscript))
+ error("all subscripts in list 'index' must be "
+ "integer vectors");
+ }
+ return R_NilValue;
+}
+
diff --git a/src/R_init_XVector.c b/src/R_init_XVector.c
index 4d89709..27adf2d 100644
--- a/src/R_init_XVector.c
+++ b/src/R_init_XVector.c
@@ -13,6 +13,11 @@ static const R_CallMethodDef callMethods[] = {
CALLMETHOD_DEF(new_output_filexp, 4),
CALLMETHOD_DEF(finalize_filexp, 1),
+/* RDS_random_access.c */
+ CALLMETHOD_DEF(RDS_read_file, 3),
+ CALLMETHOD_DEF(RDS_extract_subvector, 2),
+ CALLMETHOD_DEF(RDS_extract_subarray, 3),
+
/* SharedVector_class.c */
CALLMETHOD_DEF(get_object_address, 1),
CALLMETHOD_DEF(get_list_addresses, 1),
@@ -94,6 +99,7 @@ void R_init_XVector(DllInfo *info)
R_registerRoutines(info, NULL, callMethods, NULL, NULL);
/* io_utils.c */
+ REGISTER_CCALLABLE(_filexp_read);
REGISTER_CCALLABLE(_filexp_gets);
REGISTER_CCALLABLE(_filexp_seek);
REGISTER_CCALLABLE(_filexp_rewind);
diff --git a/src/XVector.h b/src/XVector.h
index 0d4b3a7..06d8f75 100644
--- a/src/XVector.h
+++ b/src/XVector.h
@@ -12,6 +12,12 @@
/* io_utils.c */
+int _filexp_read(
+ SEXP filexp,
+ char *buf,
+ int buf_size
+);
+
int _filexp_gets(
SEXP filexp,
char *buf,
@@ -56,6 +62,26 @@ int _delete_trailing_LF_or_CRLF(
);
+/* RDS_random_access.c */
+
+SEXP RDS_read_file(
+ SEXP filexp,
+ SEXP mode,
+ SEXP attribs_dump
+);
+
+SEXP RDS_extract_subvector(
+ SEXP filexp,
+ SEXP pos
+);
+
+SEXP RDS_extract_subarray(
+ SEXP filexp,
+ SEXP dim,
+ SEXP index
+);
+
+
/* Ocopy_byteblocks.c */
void _Ocopy_byteblocks_from_i1i2(
diff --git a/src/io_utils.c b/src/io_utils.c
index 0a95f75..5084d29 100644
--- a/src/io_utils.c
+++ b/src/io_utils.c
@@ -101,6 +101,43 @@ static void iZFile_close(const ZFile *zfile)
}
/*
+ Here is how gzread() is declared in zlib.h:
+
+ int gzread(gzFile file, voidp buf, unsigned len);
+
+ And also, according to zlib.h:
+
+ gzread returns the number of uncompressed bytes actually read, less
+ than len for end of file, or -1 for error.
+
+ But gzread returns an int and len is an unsigned int so can be > INT_MAX.
+ So how can gzread return the number of uncompressed bytes actually read
+ when len is INT_MAX? Sounds like poor interface design to me.
+ So for iZFile_read(), we set the type of buf_size to int, not unsigned int.
+*/
+static int iZFile_read(const ZFile *zfile, char *buf, int buf_size)
+{
+ int ztype;
+ void *file;
+
+ ztype = zfile->ztype;
+ file = zfile->file;
+ switch (ztype) {
+ case UNCOMPRESSED:
+ case GZ_TYPE:
+ return gzread((gzFile) file, buf, (unsigned int) buf_size);
+//#ifndef _WIN32
+// case BZ2_TYPE:
+// break;
+//#endif
+ default:
+ error(INTERNAL_ERR_IN "iZFile_read(): "
+ "invalid ztype value %d", ztype);
+ }
+ return 0;
+}
+
+/*
* Similar to fgets()/gzgets(), except that it returns a code instead of
* NULL/Z_NULL or a pointer to the buffer. The returned code can be:
* 2: if reading stopped after an EOF or a newline,
@@ -437,6 +474,12 @@ static void ZFile_close(const ZFile *zfile)
} \
}
+int _filexp_read(SEXP filexp, char *buf, int buf_size)
+{
+ CHECK_USER_INTERRUPT(2000);
+ return iZFile_read(R_ExternalPtrAddr(filexp), buf, buf_size);
+}
+
int _filexp_gets(SEXP filexp, char *buf, int buf_size, int *EOL_in_buf)
{
CHECK_USER_INTERRUPT(2000);
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/r-bioc-xvector.git
More information about the debian-med-commit
mailing list