[med-svn] [r-cran-sourcetools] 03/05: New upstream version 0.1.5
Andreas Tille
tille at debian.org
Thu Oct 12 16:46:20 UTC 2017
This is an automated email from the git hooks/post-receive script.
tille pushed a commit to branch master
in repository r-cran-sourcetools.
commit c56394c72ca6d9a85796110c6092cce62a7f8120
Author: Andreas Tille <tille at debian.org>
Date: Thu Oct 12 18:43:52 2017 +0200
New upstream version 0.1.5
---
DESCRIPTION | 21 +
LICENSE | 2 +
MD5 | 48 ++
NAMESPACE | 11 +
NEWS.md | 36 ++
R/sourcetools.R | 98 ++++
R/util.R | 57 +++
README.md | 81 ++++
debian/README.test | 9 -
debian/changelog | 5 -
debian/compat | 1 -
debian/control | 25 -
debian/copyright | 33 --
debian/docs | 3 -
debian/rules | 5 -
debian/source/format | 1 -
debian/tests/control | 3 -
debian/tests/run-unit-test | 13 -
debian/watch | 2 -
inst/include/sourcetools.h | 12 +
inst/include/sourcetools/collection/Position.h | 77 +++
inst/include/sourcetools/collection/Range.h | 34 ++
inst/include/sourcetools/collection/collection.h | 7 +
inst/include/sourcetools/core/core.h | 7 +
inst/include/sourcetools/core/macros.h | 72 +++
inst/include/sourcetools/core/util.h | 142 ++++++
inst/include/sourcetools/cursor/TextCursor.h | 66 +++
inst/include/sourcetools/cursor/TokenCursor.h | 321 +++++++++++++
inst/include/sourcetools/cursor/cursor.h | 7 +
inst/include/sourcetools/multibyte/multibyte.h | 41 ++
inst/include/sourcetools/platform/platform.h | 20 +
inst/include/sourcetools/r/RCallRecurser.h | 75 +++
inst/include/sourcetools/r/RConverter.h | 39 ++
inst/include/sourcetools/r/RFunctions.h | 85 ++++
inst/include/sourcetools/r/RHeaders.h | 8 +
.../include/sourcetools/r/RNonStandardEvaluation.h | 149 ++++++
inst/include/sourcetools/r/RUtils.h | 100 ++++
inst/include/sourcetools/r/r.h | 11 +
inst/include/sourcetools/read/MemoryMappedReader.h | 139 ++++++
.../sourcetools/read/posix/FileConnection.h | 58 +++
.../read/posix/MemoryMappedConnection.h | 55 +++
inst/include/sourcetools/read/read.h | 24 +
.../sourcetools/read/windows/FileConnection.h | 50 ++
.../read/windows/MemoryMappedConnection.h | 51 ++
inst/include/sourcetools/tests/testthat.h | 14 +
.../sourcetools/tokenization/Registration.h | 190 ++++++++
inst/include/sourcetools/tokenization/Token.h | 522 +++++++++++++++++++++
inst/include/sourcetools/tokenization/Tokenizer.h | 463 ++++++++++++++++++
.../sourcetools/tokenization/tokenization.h | 8 +
inst/include/sourcetools/utf8/utf8.h | 115 +++++
man/read.Rd | 25 +
man/tokenize-methods.Rd | 42 ++
src/Makevars | 1 +
src/Makevars.win | 1 +
src/Reader.cpp | 88 ++++
src/Tokenizer.cpp | 96 ++++
tests/testthat.R | 4 +
tests/testthat/helper-utf8.R | 3 +
tests/testthat/test-read.R | 30 ++
tests/testthat/test-tokenize.R | 165 +++++++
60 files changed, 3771 insertions(+), 100 deletions(-)
diff --git a/DESCRIPTION b/DESCRIPTION
new file mode 100644
index 0000000..24bd541
--- /dev/null
+++ b/DESCRIPTION
@@ -0,0 +1,21 @@
+Package: sourcetools
+Type: Package
+Title: Tools for Reading, Tokenizing and Parsing R Code
+Version: 0.1.5
+Author: Kevin Ushey
+Maintainer: Kevin Ushey <kevinushey at gmail.com>
+Description: Tools for the reading and tokenization of R code. The
+ 'sourcetools' package provides both an R and C++ interface for the tokenization
+ of R code, and helpers for interacting with the tokenized representation of R
+ code.
+License: MIT + file LICENSE
+LazyData: TRUE
+Depends: R (>= 3.0.2)
+Suggests: testthat
+RoxygenNote: 5.0.1
+BugReports: https://github.com/kevinushey/sourcetools/issues
+Encoding: UTF-8
+NeedsCompilation: yes
+Packaged: 2016-09-14 22:38:37 UTC; kevin
+Repository: CRAN
+Date/Publication: 2016-09-15 03:07:07
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..a7bc902
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,2 @@
+YEAR: 2015-2016
+COPYRIGHT HOLDER: Kevin Ushey
diff --git a/MD5 b/MD5
new file mode 100644
index 0000000..56e91d2
--- /dev/null
+++ b/MD5
@@ -0,0 +1,48 @@
+f46bb7c8e02f465638a7a3f70bcbb76e *DESCRIPTION
+472904db5a93a07692b3fe24cabcf96c *LICENSE
+d904b6ab89c989c9148d4efba103d02d *NAMESPACE
+f236d8fcec934db9ad39b317d5308bd7 *NEWS.md
+1782b737100f74e11e90c8c45db0d509 *R/sourcetools.R
+b09019840734b467d388e34905ebc46c *R/util.R
+d82a27087d6f3fac9d06978a31640aed *README.md
+c5215eb66349006d73ad7e65ce54046b *inst/include/sourcetools.h
+f071c0148a4629ea8150b63d2627cf0c *inst/include/sourcetools/collection/Position.h
+019b4182b5d6b9700562f8d3b90ac1f4 *inst/include/sourcetools/collection/Range.h
+d11b4138653828a197304b1cb692c614 *inst/include/sourcetools/collection/collection.h
+d77f935ab3b7da52405a501f2404d18e *inst/include/sourcetools/core/core.h
+9ad041bb1ce4251ab7afccee1ac6de3d *inst/include/sourcetools/core/macros.h
+78bfe50df612b63dab8688d7bd8c8bbd *inst/include/sourcetools/core/util.h
+353bc7e4e2bf9b62e301243a8a631e52 *inst/include/sourcetools/cursor/TextCursor.h
+81d685fb305ce649d7d159d003b4b1b3 *inst/include/sourcetools/cursor/TokenCursor.h
+42d2f27e9ae85211cd5542eaecc7c37b *inst/include/sourcetools/cursor/cursor.h
+973781254e2b3ae94eb0770554efae33 *inst/include/sourcetools/multibyte/multibyte.h
+63818672820bc3a620d09e67c68af7c2 *inst/include/sourcetools/platform/platform.h
+f10cddd374f1c671456a4120496a4291 *inst/include/sourcetools/r/RCallRecurser.h
+78a60ba9c51951eb8b4211a0bd8bc998 *inst/include/sourcetools/r/RConverter.h
+5d890077972c2a4eb07c98aafc63690c *inst/include/sourcetools/r/RFunctions.h
+eab3a99f83f6bfa7b7cd0b7e8072edaa *inst/include/sourcetools/r/RHeaders.h
+dc5e82ab54673bd892e6b420896f101b *inst/include/sourcetools/r/RNonStandardEvaluation.h
+1e2951a40e7692881c7ab4645796a9ee *inst/include/sourcetools/r/RUtils.h
+da17972b93e9b4e91554f705b4cda985 *inst/include/sourcetools/r/r.h
+8ea2ac860e800d20143be7792483ba7c *inst/include/sourcetools/read/MemoryMappedReader.h
+8096eb102d9679287cc8fa705d3a21da *inst/include/sourcetools/read/posix/FileConnection.h
+e0147869348d196193c13e89762670f0 *inst/include/sourcetools/read/posix/MemoryMappedConnection.h
+44abbae26e9ab704c92a9ce6c98ba33a *inst/include/sourcetools/read/read.h
+e77d3eb6a47db7e6d1e65f59eb3ab2c5 *inst/include/sourcetools/read/windows/FileConnection.h
+f4b06a29aca570063567f8a765609056 *inst/include/sourcetools/read/windows/MemoryMappedConnection.h
+b80180f53ca809b9ba6d4e7df6316e0b *inst/include/sourcetools/tests/testthat.h
+c6afafc697b747111348dabb88d9fb4a *inst/include/sourcetools/tokenization/Registration.h
+9ab2cf85d30171f4744d21d10c6cd532 *inst/include/sourcetools/tokenization/Token.h
+6599c5daf2f3e59861153982cc00efef *inst/include/sourcetools/tokenization/Tokenizer.h
+00c91c6e20bc534fa3a689c79770f46c *inst/include/sourcetools/tokenization/tokenization.h
+1491ededa24449d40554757c96bebaf0 *inst/include/sourcetools/utf8/utf8.h
+3005e918c6f7dbf54993a04b74ca9e54 *man/read.Rd
+a94108446e930c7c488c695e1618f049 *man/tokenize-methods.Rd
+3f03da795dd26373156bddc78d41e95d *src/Makevars
+3f03da795dd26373156bddc78d41e95d *src/Makevars.win
+43927b22e2812e5ba35419390a7933ae *src/Reader.cpp
+78946eee022f534743af918d651dd244 *src/Tokenizer.cpp
+175dc27564828d1abeb87bc004d02266 *tests/testthat.R
+b6ba9001993894a2085c981a6c58018d *tests/testthat/helper-utf8.R
+0e31fb15ea8b66d310162f60c434ed7d *tests/testthat/test-read.R
+23eb599dfa50f8915e1448b88e1719a4 *tests/testthat/test-tokenize.R
diff --git a/NAMESPACE b/NAMESPACE
new file mode 100644
index 0000000..8a6b65c
--- /dev/null
+++ b/NAMESPACE
@@ -0,0 +1,11 @@
+# Generated by roxygen2: do not edit by hand
+
+S3method(print,RTokens)
+export(read)
+export(read_bytes)
+export(read_lines)
+export(read_lines_bytes)
+export(tokenize)
+export(tokenize_file)
+export(tokenize_string)
+useDynLib(sourcetools)
diff --git a/NEWS.md b/NEWS.md
new file mode 100644
index 0000000..c936e21
--- /dev/null
+++ b/NEWS.md
@@ -0,0 +1,36 @@
+# sourcetools 0.1.5
+
+- Ensure that symbols included from e.g. `<cstdio>`, `<cstring>`
+ are resolved using a `std::` prefix.
+# sourcetools 0.1.4
+
+- More work to ensure `sourcetools` can build on Solaris.
+
+# sourcetools 0.1.3
+
+- Relax C++11 requirement, to ensure that `sourcetools` can
+ build on machines with older compilers (e.g. gcc 4.4).
+
+# sourcetools 0.1.2
+
+- Disable failing tests on Solaris.
+
+# sourcetools 0.1.1
+
+- Rename token type `ERR` to `INVALID` to fix build errors
+ on Solaris.
+
+# sourcetools 0.1.0
+
+## Features
+
+The first release of `sourcetools` comes with a small set
+of features exposed to R:
+
+- `read(file)`: Read a file (as a string). Similar to
+ `readChar()`, but faster (and maybe be optimized to
+ use a memory mapped file reader in the future).
+
+- `tokenize_file(file)`: Tokenize an R script.
+
+- `tokenize_string(string)`: Tokenize a string of R code.
diff --git a/R/sourcetools.R b/R/sourcetools.R
new file mode 100644
index 0000000..24c5512
--- /dev/null
+++ b/R/sourcetools.R
@@ -0,0 +1,98 @@
+#' @useDynLib sourcetools
+NULL
+
+#' Read the Contents of a File
+#'
+#' Read the contents of a file into a string (or, in the case of
+#' \code{read_lines}, a vector of strings).
+#'
+#' @param path A file path.
+#'
+#' @name read
+#' @rdname read
+#' @export
+read <- function(path) {
+ path <- normalizePath(path, mustWork = TRUE)
+ .Call("sourcetools_read", path, PACKAGE = "sourcetools")
+}
+
+#' @name read
+#' @rdname read
+#' @export
+read_lines <- function(path) {
+ path <- normalizePath(path, mustWork = TRUE)
+ .Call("sourcetools_read_lines", path, PACKAGE = "sourcetools")
+}
+
+#' @name read
+#' @rdname read
+#' @export
+read_bytes <- function(path) {
+ path <- normalizePath(path, mustWork = TRUE)
+ .Call("sourcetools_read_bytes", path, PACKAGE = "sourcetools")
+}
+
+#' @name read
+#' @rdname read
+#' @export
+read_lines_bytes <- function(path) {
+ path <- normalizePath(path, mustWork = TRUE)
+ .Call("sourcetools_read_lines_bytes", path, PACKAGE = "sourcetools")
+}
+
+#' Tokenize R Code
+#'
+#' Tools for tokenizing \R code.
+#'
+#' @param file,path A file path.
+#' @param text,string \R code as a character vector of length one.
+#'
+#' @note Line numbers are determined by existence of the \code{\\n}
+#' line feed character, under the assumption that code being tokenized
+#' will use either \code{\\n} to indicate newlines (as on modern
+#' Unix systems), or \code{\\r\\n} as on Windows.
+#'
+#' @return A \code{data.frame} with the following columns:
+#'
+#' \tabular{ll}{
+#' \code{value} \tab The token's contents, as a string. \cr
+#' \code{row} \tab The row where the token is located. \cr
+#' \code{column} \tab The column where the token is located. \cr
+#' \code{type} \tab The token type, as a string. \cr
+#' }
+#'
+#' @rdname tokenize-methods
+#' @export
+#' @examples
+#' tokenize_string("x <- 1 + 2")
+tokenize_file <- function(path) {
+ path <- normalizePath(path, mustWork = TRUE)
+ .Call("sourcetools_tokenize_file", path, PACKAGE = "sourcetools")
+}
+
+#' @rdname tokenize-methods
+#' @export
+tokenize_string <- function(string) {
+ .Call("sourcetools_tokenize_string", as.character(string), PACKAGE = "sourcetools")
+}
+
+#' @rdname tokenize-methods
+#' @export
+tokenize <- function(file = "", text = NULL) {
+ if (is.null(text))
+ text <- read(file)
+ tokenize_string(text)
+}
+
+#' @export
+print.RTokens <- function(x, ...) {
+ print.data.frame(x, ...)
+}
+
+parse_string <- function(string) {
+ .Call("sourcetools_parse_string", string, PACKAGE = "sourcetools")
+}
+
+parse_file <- function(file) {
+ parse_string(read(file))
+}
diff --git a/R/util.R b/R/util.R
new file mode 100644
index 0000000..f5536b7
--- /dev/null
+++ b/R/util.R
@@ -0,0 +1,57 @@
+.sourcetools <- new.env(parent = emptyenv())
+.sourcetools$gctorture <- TRUE
+
+with_gctorture <- function(expr) {
+ gctorture(.sourcetools$gctorture)
+ result <- expr
+ gctorture(FALSE)
+ result
+}
+
+check_parse <- function(R, S = R) {
+ lhs <- base::parse(text = R, keep.source = FALSE)
+ rhs <- with_gctorture(parse_string(S))
+ check_parse_impl(lhs, rhs)
+}
+
+check_parse_impl <- function(lhs, rhs) {
+
+ lhsType <- typeof(lhs)
+ rhsType <- typeof(rhs)
+
+ onError <- function(format, ...) {
+ message <- c(
+ sprintf(format, ...),
+ sprintf("R: '%s'", deparse(lhs)),
+ sprintf("S: '%s'", deparse(rhs))
+ )
+ stop(paste(message, collapse = "\n"), call. = FALSE)
+ }
+
+ if (lhsType != rhsType)
+ onError("TypeError: '%s' != '%s'", lhsType, rhsType)
+
+ if (length(lhs) != length(rhs))
+ onError("LengthError: %s != %s", length(lhs), length(rhs))
+
+ if (is.call(lhs) || is.expression(lhs)) {
+ lapply(seq_along(lhs), function(i) {
+ check_parse_impl(lhs[[i]], rhs[[i]])
+ })
+ }
+
+ if (!identical(lhs, rhs))
+ onError("IdenticalError: '%s' != '%s'", lhs, rhs)
+
+ TRUE
+}
+
+expect_parse <- function(R, S = R) {
+ testthat::expect_true(check_parse(R, S))
+}
+
+search_objects <- function() {
+ lapply(seq_along(search()), function(i) {
+ ls(pos = i, all.names = TRUE)
+ })
+}
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..37fab58
--- /dev/null
+++ b/README.md
@@ -0,0 +1,81 @@
+
+
+[![Travis-CI Build Status](https://travis-ci.org/kevinushey/sourcetools.svg?branch=master)](https://travis-ci.org/kevinushey/sourcetools) [![AppVeyor Build Status](https://ci.appveyor.com/api/projects/status/github/kevinushey/sourcetools?branch=master&svg=true)](https://ci.appveyor.com/project/kevinushey/sourcetools)
+
+
+# sourcetools
+
+Tools for reading, tokenizing, and (eventually) parsing `R` code.
+
+## Getting Started
+
+`sourcetools` is not yet on CRAN -- install with
+
+
+```r
+devtools::install_github("kevinushey/sourcetools")
+```
+
+## Reading
+
+`sourcetools` comes with a couple fast functions for reading
+files into `R`.
+
+Use `read()` and `read_lines()` to quickly read a file into
+`R` as character vectors. `read_lines()` handles both Windows
+style `\r\n` line endings, as well as Unix-style `\n` endings.
+
+
+```r
+text <- replicate(10000, paste(sample(letters, 200, TRUE), collapse = ""))
+file <- tempfile()
+cat(text, file = file, sep = "\n")
+mb <- microbenchmark::microbenchmark(times = 10,
+ readChar = readChar(file, file.info(file)$size, TRUE),
+ readLines = readLines(file),
+ read = read(file),
+ read_lines = read_lines(file)
+)
+print(mb, digits = 3)
+```
+
+```
+## Unit: milliseconds
+## expr min lq mean median uq max neval cld
+## readChar 5.2 6.54 10.5 7.02 8.73 36.56 10 ab
+## readLines 155.9 159.69 162.4 161.95 163.15 171.76 10 c
+## read 5.3 5.48 6.5 5.97 7.52 9.35 10 a
+## read_lines 13.5 13.95 14.4 14.09 14.50 16.97 10 b
+```
+
+```r
+unlink(file)
+```
+
+## Tokenization
+
+`sourcetools` provides the `tokenize_string()` and
+`tokenize_file()` functions for generating a tokenized
+representation of R code. These produce 'raw' tokenized
+representations of the code, with each token's value as a
+string, and a recorded row, column, and type:
+
+
+```r
+tokenize_string("if (x < 10) 20")
+```
+
+```
+## value row column type
+## 1 if 1 1 keyword
+## 2 1 3 whitespace
+## 3 ( 1 4 bracket
+## 4 x 1 5 symbol
+## 5 1 6 whitespace
+## 6 < 1 7 operator
+## 7 1 8 whitespace
+## 8 10 1 9 number
+## 9 ) 1 11 bracket
+## 10 1 12 whitespace
+## 11 20 1 13 number
+```
diff --git a/debian/README.test b/debian/README.test
deleted file mode 100644
index 8d70ca3..0000000
--- a/debian/README.test
+++ /dev/null
@@ -1,9 +0,0 @@
-Notes on how this package can be tested.
-────────────────────────────────────────
-
-This package can be tested by running the provided test:
-
-cd tests
-LC_ALL=C R --no-save < testthat.R
-
-in order to confirm its integrity.
diff --git a/debian/changelog b/debian/changelog
deleted file mode 100644
index 9c1f893..0000000
--- a/debian/changelog
+++ /dev/null
@@ -1,5 +0,0 @@
-r-cran-sourcetools (0.1.5-1) unstable; urgency=medium
-
- * Initial release (closes: #842958)
-
- -- Andreas Tille <tille at debian.org> Wed, 02 Nov 2016 17:20:55 +0100
diff --git a/debian/compat b/debian/compat
deleted file mode 100644
index ec63514..0000000
--- a/debian/compat
+++ /dev/null
@@ -1 +0,0 @@
-9
diff --git a/debian/control b/debian/control
deleted file mode 100644
index 45efbe4..0000000
--- a/debian/control
+++ /dev/null
@@ -1,25 +0,0 @@
-Source: r-cran-sourcetools
-Maintainer: Debian Med Packaging Team <debian-med-packaging at lists.alioth.debian.org>
-Uploaders: Andreas Tille <tille at debian.org>
-Section: gnu-r
-Priority: optional
-Build-Depends: debhelper (>= 9),
- dh-r,
- r-base-dev
-Standards-Version: 3.9.8
-Vcs-Browser: https://anonscm.debian.org/viewvc/debian-med/trunk/packages/R/r-cran-sourcetools/trunk/
-Vcs-Svn: svn://anonscm.debian.org/debian-med/trunk/packages/R/r-cran-sourcetools/trunk/
-Homepage: https://cran.r-project.org/package=sourcetools
-
-Package: r-cran-sourcetools
-Architecture: any
-Depends: ${R:Depends},
- ${shlibs:Depends},
- ${misc:Depends}
-Recommends: ${R:Recommends}
-Suggests: ${R:Suggests}
-Description: tools for reading, tokenizing and parsing R code
- Tools for the reading and tokenization of R code. The
- 'sourcetools' package provides both an R and C++ interface for the tokenization
- of R code, and helpers for interacting with the tokenized representation of R
- code.
diff --git a/debian/copyright b/debian/copyright
deleted file mode 100644
index 4decd40..0000000
--- a/debian/copyright
+++ /dev/null
@@ -1,33 +0,0 @@
-Format: https://www.debian.org/doc/packaging-manuals/copyright-format/1.0/
-Upstream-Name: sourcetools
-Upstream-Contact: Kevin Ushey <kevinushey at gmail.com>
-Source: https://cran.r-project.org/package=sourcetools
-
-Files: *
-Copyright: 2015-2016 Kevin Ushey
-License: MIT
-
-Files: debian/*
-Copyright: 2016 Andreas Tille <tille at debian.org>
-License: MIT
-
-License: MIT
- Permission is hereby granted, free of charge, to any person obtaining
- a copy of this software and associated documentation files (the
- "Software"), to deal in the Software without restriction, including
- without limitation the rights to use, copy, modify, merge, publish,
- distribute, sublicense, and/or sell copies of the Software, and to
- permit persons to whom the Software is furnished to do so, subject to
- the following conditions:
- .
- The above copyright notice and this permission notice shall be
- included in all copies or substantial portions of the Software.
- .
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
- LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
- OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
- WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
diff --git a/debian/docs b/debian/docs
deleted file mode 100644
index 960011c..0000000
--- a/debian/docs
+++ /dev/null
@@ -1,3 +0,0 @@
-tests
-debian/README.test
-debian/tests/run-unit-test
diff --git a/debian/rules b/debian/rules
deleted file mode 100755
index 529c38a..0000000
--- a/debian/rules
+++ /dev/null
@@ -1,5 +0,0 @@
-#!/usr/bin/make -f
-
-%:
- dh $@ --buildsystem R
-
diff --git a/debian/source/format b/debian/source/format
deleted file mode 100644
index 163aaf8..0000000
--- a/debian/source/format
+++ /dev/null
@@ -1 +0,0 @@
-3.0 (quilt)
diff --git a/debian/tests/control b/debian/tests/control
deleted file mode 100644
index b044b0c..0000000
--- a/debian/tests/control
+++ /dev/null
@@ -1,3 +0,0 @@
-Tests: run-unit-test
-Depends: @, r-cran-testthat
-Restrictions: allow-stderr
diff --git a/debian/tests/run-unit-test b/debian/tests/run-unit-test
deleted file mode 100644
index dca0adf..0000000
--- a/debian/tests/run-unit-test
+++ /dev/null
@@ -1,13 +0,0 @@
-#!/bin/sh -e
-
-oname=sourcetools
-pkg=r-cran-`echo $oname | tr '[A-Z]' '[a-z]'`
-
-if [ "$ADTTMP" = "" ] ; then
- ADTTMP=`mktemp -d /tmp/${pkg}-test.XXXXXX`
- trap "rm -rf $ADTTMP" 0 INT QUIT ABRT PIPE TERM
-fi
-cd $ADTTMP
-cp -a /usr/share/doc/${pkg}/tests/* $ADTTMP
-find . -name "*.gz" -exec gunzip \{\} \;
-LC_ALL=C R --no-save < testthat.R
diff --git a/debian/watch b/debian/watch
deleted file mode 100644
index 3236e50..0000000
--- a/debian/watch
+++ /dev/null
@@ -1,2 +0,0 @@
-version=4
-https://cran.r-project.org/src/contrib/sourcetools_([-\d.]*)\.tar\.gz
diff --git a/inst/include/sourcetools.h b/inst/include/sourcetools.h
new file mode 100644
index 0000000..c10c821
--- /dev/null
+++ b/inst/include/sourcetools.h
@@ -0,0 +1,12 @@
+#ifndef SOURCE_TOOLS_H
+#define SOURCE_TOOLS_H
+
+#include <sourcetools/core/core.h>
+#include <sourcetools/platform/platform.h>
+#include <sourcetools/collection/collection.h>
+#include <sourcetools/utf8/utf8.h>
+#include <sourcetools/cursor/cursor.h>
+#include <sourcetools/read/read.h>
+#include <sourcetools/tokenization/tokenization.h>
+
+#endif
diff --git a/inst/include/sourcetools/collection/Position.h b/inst/include/sourcetools/collection/Position.h
new file mode 100644
index 0000000..a0397a4
--- /dev/null
+++ b/inst/include/sourcetools/collection/Position.h
@@ -0,0 +1,77 @@
+#ifndef SOURCETOOLS_COLLECTION_POSITION_H
+#define SOURCETOOLS_COLLECTION_POSITION_H
+
+#include <ostream>
+#include <cstddef>
+
+namespace sourcetools {
+namespace collections {
+
+struct Position
+{
+ Position()
+ : row(0), column(0)
+ {
+ }
+
+ Position(std::size_t row, std::size_t column)
+ : row(row), column(column)
+ {
+ }
+
+ friend std::ostream& operator<<(std::ostream& os,
+ const Position& position)
+ {
+ os << position.row << ":" << position.column;
+ return os;
+ }
+
+ friend bool operator <(const Position& lhs, const Position& rhs)
+ {
+ return
+ lhs.row < rhs.row ||
+ (lhs.row == rhs.row && lhs.column < rhs.column);
+ }
+
+ friend bool operator <=(const Position& lhs, const Position& rhs)
+ {
+ return
+ lhs.row < rhs.row ||
+ (lhs.row == rhs.row && lhs.column <= rhs.column);
+ }
+
+ friend bool operator ==(const Position& lhs, const Position& rhs)
+ {
+ return
+ lhs.row == rhs.row &&
+ lhs.column == rhs.column;
+ }
+
+ friend bool operator >(const Position& lhs, const Position& rhs)
+ {
+ return
+ lhs.row > rhs.row ||
+ (lhs.row == rhs.row && lhs.column > rhs.column);
+ }
+
+ friend bool operator >=(const Position& lhs, const Position& rhs)
+ {
+ return
+ lhs.row > rhs.row ||
+ (lhs.row == rhs.row && lhs.column >= rhs.column);
+ }
+
+ friend Position operator +(const Position& lhs, std::size_t rhs)
+ {
+ return Position(lhs.row, lhs.column + rhs);
+ }
+
+ std::size_t row;
+ std::size_t column;
+
+};
+
+} // namespace collections
+} // namespace sourcetools
+
+#endif /* SOURCETOOLS_COLLECTION_POSITION_H */
diff --git a/inst/include/sourcetools/collection/Range.h b/inst/include/sourcetools/collection/Range.h
new file mode 100644
index 0000000..36f8f61
--- /dev/null
+++ b/inst/include/sourcetools/collection/Range.h
@@ -0,0 +1,34 @@
+#ifndef SOURCETOOLS_COLLECTION_RANGE_H
+#define SOURCETOOLS_COLLECTION_RANGE_H
+
+#include <ostream>
+#include <sourcetools/collection/Position.h>
+
+namespace sourcetools {
+namespace collections {
+
+class Range
+{
+public:
+ Range(const Position& start, const Position& end)
+ : start_(start), end_(end)
+ {
+ }
+
+ friend std::ostream& operator <<(std::ostream& os, const Range& range)
+ {
+ os << "[" << range.start() << "-" << range.end() << "]";
+ return os;
+ }
+
+ const Position start() const { return start_; }
+ const Position end() const { return end_; }
+
+private:
+ Position start_;
+ Position end_;
+};
+} // namespace collections
+} // namespace sourcetools
+
+#endif /* SOURCETOOLS_COLLECTION_RANGE_H */
diff --git a/inst/include/sourcetools/collection/collection.h b/inst/include/sourcetools/collection/collection.h
new file mode 100644
index 0000000..68c99e2
--- /dev/null
+++ b/inst/include/sourcetools/collection/collection.h
@@ -0,0 +1,7 @@
+#ifndef SOURCETOOLS_COLLECTION_COLLECTION_H
+#define SOURCETOOLS_COLLECTION_COLLECTION_H
+
+#include <sourcetools/collection/Position.h>
+#include <sourcetools/collection/Range.h>
+
+#endif /* SOURCETOOLS_COLLECTION_COLLECTION_H */
diff --git a/inst/include/sourcetools/core/core.h b/inst/include/sourcetools/core/core.h
new file mode 100644
index 0000000..5b6bed3
--- /dev/null
+++ b/inst/include/sourcetools/core/core.h
@@ -0,0 +1,7 @@
+#ifndef SOURCETOOLS_CORE_CORE_H
+#define SOURCETOOLS_CORE_CORE_H
+
+#include <sourcetools/core/macros.h>
+#include <sourcetools/core/util.h>
+
+#endif /* SOURCETOOLS_CORE_CORE_H */
diff --git a/inst/include/sourcetools/core/macros.h b/inst/include/sourcetools/core/macros.h
new file mode 100644
index 0000000..57c4400
--- /dev/null
+++ b/inst/include/sourcetools/core/macros.h
@@ -0,0 +1,72 @@
+#ifndef SOURCETOOLS_CORE_MACROS_H
+#define SOURCETOOLS_CORE_MACROS_H
+
+#include <cstdio>
+
+#include <string>
+#include <iostream>
+
+/* Utility */
+#ifdef __GNUC__
+# define LIKELY(x) __builtin_expect(!!(x), 1)
+# define UNLIKELY(x) __builtin_expect(!!(x), 0)
+#else
+# define LIKELY(x) x
+# define UNLIKELY(x) x
+#endif
+
+#define SOURCE_TOOLS_CHECK_MASK(__SELF__, __MASK__) \
+ ((__MASK__ & __SELF__) == __MASK__)
+
+#define SOURCE_TOOLS_LOWER_BITS(__VALUE__, __BITS__) \
+ (((1 << __BITS__) - 1) & __VALUE__)
+
+#define SOURCE_TOOLS_PASTE(__X__, __Y__) __X__ ## __Y__
+#define SOURCE_TOOLS_STRINGIFY(__X__) #__X__
+
+/* Logging */
+namespace sourcetools {
+namespace debug {
+
+inline std::string shortFilePath(const std::string& filePath)
+{
+ std::string::size_type index = filePath.find_last_of("/");
+ if (index != std::string::npos)
+ return filePath.substr(index + 1);
+ return filePath;
+}
+
+inline std::string debugPosition(const char* filePath, int line)
+{
+ static const int N = 1024;
+ char buffer[N + 1];
+ std::string shortPath = shortFilePath(filePath);
+ if (shortPath.size() > N / 2)
+ shortPath = shortPath.substr(0, N / 2);
+ std::sprintf(buffer, "[%s:%4i]", shortPath.c_str(), line);
+ return buffer;
+}
+
+} // namespace debug
+} // namespace sourcetools
+
+// Flip on/off as necessary
+#define SOURCE_TOOLS_ENABLE_DEBUG_LOGGING
+
+#ifdef SOURCE_TOOLS_ENABLE_DEBUG_LOGGING
+
+#include <iostream>
+
+#define DEBUG(__X__) \
+ std::cerr << ::sourcetools::debug::debugPosition(__FILE__, __LINE__) \
+ << ": " << __X__ << ::std::endl;
+#define DEBUG_BLOCK(x)
+
+#else
+
+#define DEBUG(x)
+#define DEBUG_BLOCK(x) if (false)
+
+#endif
+
+#endif /* SOURCETOOLS_CORE_MACROS_H */
diff --git a/inst/include/sourcetools/core/util.h b/inst/include/sourcetools/core/util.h
new file mode 100644
index 0000000..8d16cfd
--- /dev/null
+++ b/inst/include/sourcetools/core/util.h
@@ -0,0 +1,142 @@
+#ifndef SOURCETOOLS_CORE_UTIL_H
+#define SOURCETOOLS_CORE_UTIL_H
+
+#include <string>
+#include <memory>
+#include <cctype>
+#include <cstdlib>
+
+namespace sourcetools {
+namespace detail {
+
+class noncopyable
+{
+protected:
+ noncopyable() {}
+ ~noncopyable() {}
+
+private:
+ noncopyable(const noncopyable&);
+ noncopyable& operator=(const noncopyable&);
+};
+
+} // namespace detail
+typedef detail::noncopyable noncopyable;
+
+template <typename T>
+class scoped_ptr : noncopyable
+{
+public:
+ explicit scoped_ptr(T* pData) : pData_(pData) {}
+ T& operator*() const { return *pData_; }
+ T* operator->() const { return pData_; }
+ operator T*() const { return pData_; }
+ ~scoped_ptr() { delete pData_; }
+private:
+ T* pData_;
+};
+
+template <typename T>
+class scoped_array : noncopyable
+{
+public:
+ explicit scoped_array(T* pData) : pData_(pData) {}
+ T& operator*() const { return *pData_; }
+ T* operator->() const { return pData_; }
+ operator T*() const { return pData_; }
+ ~scoped_array() { delete[] pData_; }
+private:
+ T* pData_;
+};
+
+namespace utils {
+
+inline bool isWhitespace(char ch)
+{
+ return
+ ch == ' ' ||
+ ch == '\f' ||
+ ch == '\r' ||
+ ch == '\n' ||
+ ch == '\t' ||
+ ch == '\v';
+}
+
+template <typename T>
+inline bool countWhitespaceBytes(const char* data,
+ T* pBytes)
+{
+ T bytes = 0;
+ while (isWhitespace(*data)) {
+ ++data;
+ ++bytes;
+ }
+
+ *pBytes = bytes;
+ return bytes != 0;
+}
+
+inline bool isDigit(char ch)
+{
+ return
+ (ch >= '0' && ch <= '9');
+}
+
+inline bool isAlphabetic(char ch)
+{
+ return
+ (ch >= 'a' && ch <= 'z') ||
+ (ch >= 'A' && ch <= 'Z');
+}
+
+inline bool isAlphaNumeric(char ch)
+{
+ return
+ (ch >= 'a' && ch <= 'z') ||
+ (ch >= 'A' && ch <= 'Z') ||
+ (ch >= '0' && ch <= '9');
+}
+
+inline bool isHexDigit(char ch)
+{
+ return
+ (ch >= '0' && ch <= '9') ||
+ (ch >= 'a' && ch <= 'f') ||
+ (ch >= 'A' && ch <= 'F');
+}
+
+inline bool isValidForStartOfRSymbol(char ch)
+{
+ return
+ isAlphabetic(ch) ||
+ ch == '.' ||
+ ch < 0;
+}
+
+inline bool isValidForRSymbol(char ch)
+{
+ return
+ isAlphaNumeric(ch) ||
+ ch == '.' ||
+ ch == '_' ||
+ ch < 0;
+}
+
+inline std::string escape(char ch)
+{
+ switch (ch) {
+ case '\r':
+ return "\\r";
+ case '\n':
+ return "\\n";
+ case '\t':
+ return "\\t";
+ default:
+ return std::string(1, ch);
+ }
+}
+
+} // namespace utils
+} // namespace sourcetools
+
+#endif /* SOURCETOOLS_CORE_UTIL_H */
diff --git a/inst/include/sourcetools/cursor/TextCursor.h b/inst/include/sourcetools/cursor/TextCursor.h
new file mode 100644
index 0000000..1a5a212
--- /dev/null
+++ b/inst/include/sourcetools/cursor/TextCursor.h
@@ -0,0 +1,66 @@
+#ifndef SOURCETOOLS_CURSOR_TEXT_CURSOR_H
+#define SOURCETOOLS_CURSOR_TEXT_CURSOR_H
+
+#include <string>
+
+#include <sourcetools/core/macros.h>
+#include <sourcetools/collection/Position.h>
+
+namespace sourcetools {
+namespace cursors {
+
+class TextCursor
+{
+public:
+
+ TextCursor(const char* text, std::size_t n)
+ : text_(text),
+ n_(n),
+ offset_(0),
+ position_(0, 0)
+ {
+ }
+
+ char peek(std::size_t offset = 0)
+ {
+ std::size_t index = offset_ + offset;
+ if (UNLIKELY(index >= n_))
+ return '\0';
+ return text_[index];
+ }
+
+ void advance(std::size_t times = 1)
+ {
+ for (std::size_t i = 0; i < times; ++i) {
+ if (peek() == '\n') {
+ ++position_.row;
+ position_.column = 0;
+ } else {
+ ++position_.column;
+ }
+ ++offset_;
+ }
+ }
+
+ operator const char*() const { return text_ + offset_; }
+
+ std::size_t offset() const { return offset_; }
+
+ const collections::Position& position() const { return position_; }
+ std::size_t row() const { return position_.row; }
+ std::size_t column() const { return position_.column; }
+
+ const char* begin() const { return text_; }
+ const char* end() const { return text_ + n_; }
+
+private:
+ const char* text_;
+ std::size_t n_;
+ std::size_t offset_;
+ collections::Position position_;
+};
+
+} // namespace cursors
+} // namespace sourcetools
+
+#endif /* SOURCETOOLS_CURSOR_TEXT_CURSOR_H */
diff --git a/inst/include/sourcetools/cursor/TokenCursor.h b/inst/include/sourcetools/cursor/TokenCursor.h
new file mode 100644
index 0000000..e0f3025
--- /dev/null
+++ b/inst/include/sourcetools/cursor/TokenCursor.h
@@ -0,0 +1,321 @@
+#ifndef SOURCETOOLS_CURSOR_TOKEN_CURSOR_H
+#define SOURCETOOLS_CURSOR_TOKEN_CURSOR_H
+
+#include <cstring>
+#include <algorithm>
+
+#include <sourcetools/collection/Position.h>
+#include <sourcetools/tokenization/Token.h>
+
+namespace sourcetools {
+namespace cursors {
+
+class TokenCursor {
+
+private:
+ typedef collections::Position Position;
+ typedef tokens::Token Token;
+
+public:
+
+ TokenCursor(const std::vector<Token>& tokens)
+ : tokens_(tokens),
+ offset_(0),
+ n_(tokens.size()),
+ noSuchToken_(tokens::END)
+ {}
+
+ bool moveToNextToken()
+ {
+ if (UNLIKELY(offset_ >= n_ - 1))
+ return false;
+
+ ++offset_;
+ return true;
+ }
+
+ bool moveToNextSignificantToken()
+ {
+ if (!moveToNextToken())
+ return false;
+
+ if (!fwdOverWhitespaceAndComments())
+ return false;
+
+ return true;
+ }
+
+ bool moveToPreviousToken()
+ {
+ if (UNLIKELY(offset_ == 0))
+ return false;
+
+ --offset_;
+ return true;
+ }
+
+ bool moveToPreviousSignificantToken()
+ {
+ if (!moveToPreviousToken())
+ return false;
+
+ if (!bwdOverWhitespaceAndComments())
+ return false;
+
+ return true;
+ }
+
+ const Token& peekFwd(std::size_t offset = 1) const
+ {
+ std::size_t index = offset_ + offset;
+ if (UNLIKELY(index >= n_))
+ return noSuchToken_;
+
+ return tokens_[index];
+ }
+
+ const Token& peekBwd(std::size_t offset = 1) const
+ {
+ if (UNLIKELY(offset > offset_))
+ return noSuchToken_;
+
+ std::size_t index = offset_ - offset;
+ return tokens_[index];
+ }
+
+ const Token& currentToken() const
+ {
+ if (UNLIKELY(offset_ >= n_))
+ return noSuchToken_;
+ return tokens_[offset_];
+ }
+
+ operator const Token&() const { return currentToken(); }
+
+ bool fwdOverWhitespace()
+ {
+ while (isType(tokens::WHITESPACE))
+ if (!moveToNextToken())
+ return false;
+ return true;
+ }
+
+ bool bwdOverWhitespace()
+ {
+ while (isType(tokens::WHITESPACE))
+ if (!moveToPreviousToken())
+ return false;
+ return true;
+ }
+
+ bool fwdOverComments()
+ {
+ while (isType(tokens::COMMENT))
+ if (!moveToNextToken())
+ return false;
+ return true;
+ }
+
+ bool bwdOverComments()
+ {
+ while (isType(tokens::COMMENT))
+ if (!moveToPreviousToken())
+ return false;
+ return true;
+ }
+
+ bool fwdOverWhitespaceAndComments()
+ {
+ while (isType(tokens::COMMENT) || isType(tokens::WHITESPACE))
+ if (!moveToNextToken())
+ return false;
+ return true;
+ }
+
+ bool bwdOverWhitespaceAndComments()
+ {
+ while (isType(tokens::COMMENT) || isType(tokens::WHITESPACE))
+ if (!moveToPreviousToken())
+ return false;
+ return true;
+ }
+
+ const Token& nextSignificantToken(std::size_t times = 1) const
+ {
+ TokenCursor clone(*this);
+ for (std::size_t i = 0; i < times; ++i)
+ clone.moveToNextSignificantToken();
+ return clone;
+ }
+
+ const Token& previousSignificantToken(std::size_t times = 1) const
+ {
+ TokenCursor clone(*this);
+ for (std::size_t i = 0; i < times; ++i)
+ clone.moveToPreviousSignificantToken();
+ return clone;
+ }
+
+ bool moveToPosition(std::size_t row, std::size_t column)
+ {
+ return moveToPosition(Position(row, column));
+ }
+
+ bool moveToPosition(const Position& target)
+ {
+ if (UNLIKELY(n_ == 0))
+ return false;
+
+ if (UNLIKELY(tokens_[n_ - 1].position() <= target))
+ {
+ offset_ = n_ - 1;
+ return true;
+ }
+
+ std::size_t start = 0;
+ std::size_t end = n_;
+
+ std::size_t offset = 0;
+ while (true)
+ {
+ offset = (start + end) / 2;
+ const Position& current = tokens_[offset].position();
+
+ if (current == target || start == end)
+ break;
+ else if (current < target)
+ start = offset + 1;
+ else
+ end = offset - 1;
+ }
+
+ offset_ = offset;
+ return true;
+ }
+
+ template <typename F>
+ bool findFwd(F f)
+ {
+ do {
+ if (f(this))
+ return true;
+ } while (moveToNextToken());
+
+ return false;
+ }
+
+ template <typename F>
+ bool findBwd(F f)
+ {
+ do {
+ if (f(this))
+ return true;
+ } while (moveToPreviousToken());
+
+ return false;
+ }
+
+ bool findFwd(const char* contents)
+ {
+ return findFwd(std::string(contents, std::strlen(contents)));
+ }
+
+ bool findFwd(const std::string& contents)
+ {
+ do {
+ if (currentToken().contentsEqual(contents))
+ return true;
+ } while (moveToNextToken());
+
+ return false;
+ }
+
+ bool findBwd(const char* contents)
+ {
+ return findBwd(std::string(contents, std::strlen(contents)));
+ }
+
+ bool findBwd(const std::string& contents)
+ {
+ do {
+ if (currentToken().contentsEqual(contents))
+ return true;
+ } while (moveToPreviousToken());
+
+ return false;
+ }
+
+ bool fwdToMatchingBracket()
+ {
+ using namespace tokens;
+ if (!isLeftBracket(currentToken()))
+ return false;
+
+ TokenType lhs = currentToken().type();
+ TokenType rhs = complement(lhs);
+ std::size_t balance = 1;
+
+ while (moveToNextSignificantToken())
+ {
+ TokenType type = currentToken().type();
+ balance += type == lhs;
+ balance -= type == rhs;
+ if (balance == 0) return true;
+ }
+
+ return false;
+ }
+
+ bool bwdToMatchingBracket()
+ {
+ using namespace tokens;
+ if (!isRightBracket(currentToken()))
+ return false;
+
+ TokenType lhs = currentToken().type();
+ TokenType rhs = complement(lhs);
+ std::size_t balance = 1;
+
+ while (moveToPreviousSignificantToken())
+ {
+ TokenType type = currentToken().type();
+ balance += type == lhs;
+ balance -= type == rhs;
+ if (balance == 0) return true;
+ }
+
+ return false;
+ }
+
+ friend std::ostream& operator<<(std::ostream& os, const TokenCursor& cursor)
+ {
+ return os << toString(cursor.currentToken());
+ }
+
+ tokens::TokenType type() const { return currentToken().type(); }
+ bool isType(tokens::TokenType type) const { return currentToken().isType(type); }
+ collections::Position position() const { return currentToken().position(); }
+ std::size_t offset() const { return offset_; }
+ std::size_t row() const { return currentToken().row(); }
+ std::size_t column() const { return currentToken().column(); }
+
+
+private:
+
+ const std::vector<Token>& tokens_;
+ std::size_t offset_;
+ std::size_t n_;
+ Token noSuchToken_;
+
+};
+
+} // namespace cursors
+
+inline std::string toString(const cursors::TokenCursor& cursor)
+{
+ return toString(cursor.currentToken());
+}
+
+} // namespace sourcetools
+
+#endif /* SOURCETOOLS_CURSOR_TOKEN_CURSOR_H */
diff --git a/inst/include/sourcetools/cursor/cursor.h b/inst/include/sourcetools/cursor/cursor.h
new file mode 100644
index 0000000..e0b1cdd
--- /dev/null
+++ b/inst/include/sourcetools/cursor/cursor.h
@@ -0,0 +1,7 @@
+#ifndef SOURCETOOLS_CURSOR_CURSOR_H
+#define SOURCETOOLS_CURSOR_CURSOR_H
+
+#include <sourcetools/cursor/TextCursor.h>
+#include <sourcetools/cursor/TokenCursor.h>
+
+#endif /* SOURCETOOLS_CURSOR_CURSOR_H */
diff --git a/inst/include/sourcetools/multibyte/multibyte.h b/inst/include/sourcetools/multibyte/multibyte.h
new file mode 100644
index 0000000..f2c34c4
--- /dev/null
+++ b/inst/include/sourcetools/multibyte/multibyte.h
@@ -0,0 +1,41 @@
#ifndef SOURCETOOLS_MULTIBYTE_MULTIBYTE_H
#define SOURCETOOLS_MULTIBYTE_MULTIBYTE_H

#include <cstdlib>
#include <cwchar>
#include <cwctype>  // std::iswspace is declared here, not in <cwchar>

namespace sourcetools {
namespace multibyte {

// Count the number of leading bytes of 'data' (a NUL-terminated
// multibyte string in the current locale) that encode whitespace
// characters. The byte count is written to '*pBytes'; returns true
// when at least one whitespace byte was consumed.
template <typename T>
inline bool countWhitespaceBytes(const char* data,
                                 T* pBytes)
{
  wchar_t ch;
  T bytes = 0;
  const char* it = data;

  while (true) {

    // Decode the next multibyte character: 0 means we reached the
    // NUL terminator; -1 means an invalid sequence. Stop either way.
    int status = std::mbtowc(&ch, it, MB_CUR_MAX);
    if (status == 0) {
      break;
    } else if (status == -1) {
      break;
    }

    if (!std::iswspace(ch))
      break;

    bytes += status;
    it += status;
  }

  *pBytes = bytes;
  return bytes != 0;
}

} // namespace multibyte
} // namespace sourcetools

#endif /* SOURCETOOLS_MULTIBYTE_MULTIBYTE_H */
diff --git a/inst/include/sourcetools/platform/platform.h b/inst/include/sourcetools/platform/platform.h
new file mode 100644
index 0000000..2f6d0c2
--- /dev/null
+++ b/inst/include/sourcetools/platform/platform.h
@@ -0,0 +1,20 @@
+#ifndef SOURCETOOLS_PLATFORM_PLATFORM_H
+#define SOURCETOOLS_PLATFORM_PLATFORM_H
+
+#ifdef _WIN32
+# define SOURCETOOLS_PLATFORM_WINDOWS
+#endif
+
+#ifdef __APPLE__
+# define SOURCETOOLS_PLATFORM_MACOS
+#endif
+
+#ifdef __linux__
+# define SOURCETOOLS_PLATFORM_LINUX
+#endif
+
+#if defined(__sun) && defined(__SVR4)
+# define SOURCETOOLS_PLATFORM_SOLARIS
+#endif
+
+#endif /* SOURCETOOLS_PLATFORM_PLATFORM_H */
diff --git a/inst/include/sourcetools/r/RCallRecurser.h b/inst/include/sourcetools/r/RCallRecurser.h
new file mode 100644
index 0000000..6c55f83
--- /dev/null
+++ b/inst/include/sourcetools/r/RCallRecurser.h
@@ -0,0 +1,75 @@
+#ifndef SOURCETOOLS_R_R_CALL_RECURSER_H
+#define SOURCETOOLS_R_R_CALL_RECURSER_H
+
+#include <vector>
+
+#include <sourcetools/core/core.h>
+
+#include <sourcetools/r/RHeaders.h>
+#include <sourcetools/r/RFunctions.h>
+
+
+namespace sourcetools {
+namespace r {
+
+class CallRecurser : noncopyable
+{
+public:
+
+ class Operation
+ {
+ public:
+ virtual void apply(SEXP dataSEXP) = 0;
+ virtual ~Operation() {}
+ };
+
+ explicit CallRecurser(SEXP dataSEXP)
+ {
+ if (Rf_isPrimitive(dataSEXP))
+ dataSEXP_ = R_NilValue;
+ else if (Rf_isFunction(dataSEXP))
+ dataSEXP_ = r::util::functionBody(dataSEXP);
+ else if (TYPEOF(dataSEXP) == LANGSXP)
+ dataSEXP_ = dataSEXP;
+ else
+ dataSEXP_ = R_NilValue;
+ }
+
+ void add(Operation* pOperation)
+ {
+ operations_.push_back(pOperation);
+ }
+
+ void run()
+ {
+ runImpl(dataSEXP_);
+ }
+
+ void runImpl(SEXP dataSEXP)
+ {
+ for (std::vector<Operation*>::iterator it = operations_.begin();
+ it != operations_.end();
+ ++it)
+ {
+ (*it)->apply(dataSEXP);
+ }
+
+ if (TYPEOF(dataSEXP) == LANGSXP)
+ {
+ while (dataSEXP != R_NilValue)
+ {
+ runImpl(CAR(dataSEXP));
+ dataSEXP = CDR(dataSEXP);
+ }
+ }
+ }
+
+private:
+ SEXP dataSEXP_;
+ std::vector<Operation*> operations_;
+};
+
+} // namespace r
+} // namespace sourcetools
+
+#endif /* SOURCETOOLS_R_R_CALL_RECURSER_H */
diff --git a/inst/include/sourcetools/r/RConverter.h b/inst/include/sourcetools/r/RConverter.h
new file mode 100644
index 0000000..7b1b7dd
--- /dev/null
+++ b/inst/include/sourcetools/r/RConverter.h
@@ -0,0 +1,39 @@
+#ifndef SOURCETOOLS_R_R_CONVERTER_H
+#define SOURCETOOLS_R_R_CONVERTER_H
+
+#include <vector>
+#include <string>
+
+#include <sourcetools/r/RUtils.h>
+#include <sourcetools/r/RHeaders.h>
+
+namespace sourcetools {
+namespace r {
+
+inline SEXP Rf_mkChar(const std::string& data)
+{
+ return Rf_mkCharLen(data.c_str(), data.size());
+}
+
+inline SEXP Rf_mkString(const std::string& data)
+{
+ Protect protect;
+ SEXP resultSEXP = protect(Rf_allocVector(STRSXP, 1));
+ SET_STRING_ELT(resultSEXP, 0, Rf_mkChar(data));
+ return resultSEXP;
+}
+
+inline SEXP create(const std::vector<std::string>& vector)
+{
+ Protect protect;
+ std::size_t n = vector.size();
+ SEXP resultSEXP = protect(Rf_allocVector(STRSXP, n));
+ for (std::size_t i = 0; i < n; ++i)
+ SET_STRING_ELT(resultSEXP, i, Rf_mkChar(vector[i]));
+ return resultSEXP;
+}
+
+} // namespace r
+} // namespace sourcetools
+
+#endif /* SOURCETOOLS_R_R_CONVERTER_H */
diff --git a/inst/include/sourcetools/r/RFunctions.h b/inst/include/sourcetools/r/RFunctions.h
new file mode 100644
index 0000000..109ab44
--- /dev/null
+++ b/inst/include/sourcetools/r/RFunctions.h
@@ -0,0 +1,85 @@
+#ifndef SOURCETOOLS_R_R_FUNCTIONS_H
+#define SOURCETOOLS_R_R_FUNCTIONS_H
+
+#include <string>
+#include <set>
+
+#include <sourcetools/r/RUtils.h>
+
+namespace sourcetools {
+namespace r {
+
+inline SEXP eval(const std::string& fn, SEXP envSEXP = NULL)
+{
+ Protect protect;
+ if (envSEXP == NULL)
+ {
+ SEXP strSEXP = protect(Rf_mkString("sourcetools"));
+ envSEXP = R_FindNamespace(strSEXP);
+ }
+
+ SEXP callSEXP = protect(Rf_lang1(Rf_install(fn.c_str())));
+ SEXP resultSEXP = protect(Rf_eval(callSEXP, envSEXP));
+ return resultSEXP;
+}
+
+inline std::set<std::string> objectsOnSearchPath()
+{
+ std::set<std::string> results;
+ Protect protect;
+
+ SEXP objectsSEXP;
+ protect(objectsSEXP = eval("search_objects"));
+
+ for (R_xlen_t i = 0; i < Rf_length(objectsSEXP); ++i)
+ {
+ SEXP strSEXP = VECTOR_ELT(objectsSEXP, i);
+ for (R_xlen_t j = 0; j < Rf_length(strSEXP); ++j)
+ {
+ SEXP charSEXP = STRING_ELT(strSEXP, j);
+ std::string element(CHAR(charSEXP), Rf_length(charSEXP));
+ results.insert(element);
+ }
+ }
+
+ return results;
+}
+
+namespace util {
+
+inline void setNames(SEXP dataSEXP, const char** names, std::size_t n)
+{
+ RObjectFactory factory;
+ SEXP namesSEXP = factory.create(STRSXP, n);
+ for (std::size_t i = 0; i < n; ++i)
+ SET_STRING_ELT(namesSEXP, i, Rf_mkChar(names[i]));
+ Rf_setAttrib(dataSEXP, R_NamesSymbol, namesSEXP);
+}
+
+inline void listToDataFrame(SEXP listSEXP, int n)
+{
+ r::Protect protect;
+ SEXP classSEXP = protect(Rf_mkString("data.frame"));
+ Rf_setAttrib(listSEXP, R_ClassSymbol, classSEXP);
+
+ SEXP rownamesSEXP = protect(Rf_allocVector(INTSXP, 2));
+ INTEGER(rownamesSEXP)[0] = NA_INTEGER;
+ INTEGER(rownamesSEXP)[1] = -n;
+ Rf_setAttrib(listSEXP, R_RowNamesSymbol, rownamesSEXP);
+}
+
+inline SEXP functionBody(SEXP fnSEXP)
+{
+ SEXP bodyFunctionSEXP = Rf_findFun(Rf_install("body"), R_BaseNamespace);
+
+ r::Protect protect;
+ SEXP callSEXP = protect(Rf_lang2(bodyFunctionSEXP, fnSEXP));
+ return Rf_eval(callSEXP, R_BaseNamespace);
+}
+
+} // namespace util
+
+} // namespace r
+} // namespace sourcetools
+
+#endif /* SOURCETOOLS_R_R_FUNCTIONS_H */
diff --git a/inst/include/sourcetools/r/RHeaders.h b/inst/include/sourcetools/r/RHeaders.h
new file mode 100644
index 0000000..89e2130
--- /dev/null
+++ b/inst/include/sourcetools/r/RHeaders.h
@@ -0,0 +1,8 @@
+#ifndef SOURCETOOLS_R_R_HEADERS_H
+#define SOURCETOOLS_R_R_HEADERS_H
+
+#define R_NO_REMAP
+#include <R.h>
+#include <Rinternals.h>
+
+#endif /* SOURCETOOLS_R_R_HEADERS_H */
diff --git a/inst/include/sourcetools/r/RNonStandardEvaluation.h b/inst/include/sourcetools/r/RNonStandardEvaluation.h
new file mode 100644
index 0000000..cb7f7df
--- /dev/null
+++ b/inst/include/sourcetools/r/RNonStandardEvaluation.h
@@ -0,0 +1,149 @@
+#ifndef SOURCETOOLS_R_R_NON_STANDARD_EVALUATION_H
+#define SOURCETOOLS_R_R_NON_STANDARD_EVALUATION_H
+
+#include <set>
+#include <map>
+
+#include <sourcetools/r/RHeaders.h>
+#include <sourcetools/r/RCallRecurser.h>
+
+namespace sourcetools {
+namespace r {
+namespace nse {
+
+namespace detail {
+
// Build the set of function names treated as performing non-standard
// evaluation.
inline std::set<std::string> makeNsePrimitives()
{
  static const char* names[] = {
    "quote", "substitute", "eval", "evalq", "lazy_dots"
  };

  std::size_t n = sizeof(names) / sizeof(names[0]);
  return std::set<std::string>(names, names + n);
}

// Singleton accessor for the NSE primitive name set.
inline std::set<std::string>& nsePrimitives()
{
  static std::set<std::string> instance = makeNsePrimitives();
  return instance;
}
+
+class PerformsNonStandardEvaluationOperation
+ : public r::CallRecurser::Operation
+{
+public:
+
+ PerformsNonStandardEvaluationOperation()
+ : status_(false)
+ {
+ }
+
+ virtual void apply(SEXP dataSEXP)
+ {
+ if (status_ || TYPEOF(dataSEXP) != LANGSXP)
+ return;
+
+ if ((status_ = checkCall(dataSEXP)))
+ return;
+
+ SEXP fnSEXP = CAR(dataSEXP);
+ if (TYPEOF(fnSEXP) == SYMSXP)
+ status_ = nsePrimitives().count(CHAR(PRINTNAME(fnSEXP)));
+ else if (TYPEOF(fnSEXP) == STRSXP)
+ status_ = nsePrimitives().count(CHAR(STRING_ELT(fnSEXP, 0)));
+
+ }
+
+ bool status() const { return status_; }
+
+private:
+
+ bool checkCall(SEXP callSEXP)
+ {
+ std::size_t n = Rf_length(callSEXP);
+ if (n == 0)
+ return false;
+
+ SEXP fnSEXP = CAR(callSEXP);
+ if (fnSEXP == Rf_install("::") || fnSEXP == Rf_install(":::"))
+ {
+ SEXP lhsSEXP = CADR(callSEXP);
+ SEXP rhsSEXP = CADDR(callSEXP);
+
+ if (lhsSEXP == Rf_install("lazyeval") && rhsSEXP == Rf_install("lazy_dots"))
+ return true;
+ }
+
+ return false;
+ }
+
+private:
+ bool status_;
+};
+
+} // namespace detail
+
+class Database
+{
+public:
+ bool check(SEXP dataSEXP)
+ {
+ if (contains(dataSEXP))
+ return get(dataSEXP);
+
+ typedef detail::PerformsNonStandardEvaluationOperation Operation;
+ scoped_ptr<Operation> operation(new Operation);
+
+ r::CallRecurser recurser(dataSEXP);
+ recurser.add(operation);
+ recurser.run();
+
+ set(dataSEXP, operation->status());
+ return operation->status();
+ }
+
+private:
+
+ bool contains(SEXP dataSEXP)
+ {
+ return map_.count(address(dataSEXP));
+ }
+
+ bool get(SEXP dataSEXP)
+ {
+ return map_[address(dataSEXP)];
+ }
+
+ void set(SEXP dataSEXP, bool value)
+ {
+ map_[address(dataSEXP)] = value;
+ }
+
+ std::size_t address(SEXP dataSEXP)
+ {
+ return reinterpret_cast<std::size_t>(dataSEXP);
+ }
+
+ std::map<std::size_t, bool> map_;
+};
+
+inline Database& database()
+{
+ static Database instance;
+ return instance;
+}
+
+inline bool performsNonStandardEvaluation(SEXP fnSEXP)
+{
+ return database().check(fnSEXP);
+}
+
+} // namespace nse
+} // namespace r
+} // namespace sourcetools
+
+#endif /* SOURCETOOLS_R_R_NON_STANDARD_EVALUATION_H */
diff --git a/inst/include/sourcetools/r/RUtils.h b/inst/include/sourcetools/r/RUtils.h
new file mode 100644
index 0000000..8fcc9e4
--- /dev/null
+++ b/inst/include/sourcetools/r/RUtils.h
@@ -0,0 +1,100 @@
+#ifndef SOURCETOOLS_R_R_UTILS_H
+#define SOURCETOOLS_R_R_UTILS_H
+
+#include <vector>
+
+#include <sourcetools/core/core.h>
+
+#include <sourcetools/r/RHeaders.h>
+
+namespace sourcetools {
+namespace r {
+
+class Protect : noncopyable
+{
+public:
+ Protect(): n_(0) {}
+ ~Protect() { UNPROTECT(n_); }
+
+ SEXP operator()(SEXP objectSEXP)
+ {
+ ++n_;
+ return PROTECT(objectSEXP);
+ }
+
+private:
+ int n_;
+};
+
+class RObjectFactory : noncopyable
+{
+public:
+
+ RObjectFactory()
+ : n_(0)
+ {
+ }
+
+ template <typename T, typename F>
+ SEXP create(SEXPTYPE type, const std::vector<T>& vector, F f)
+ {
+ ++n_;
+ std::size_t n = vector.size();
+ SEXP resultSEXP = PROTECT(Rf_allocVector(type, n));
+ for (std::size_t i = 0; i < n; ++i)
+ f(resultSEXP, i, vector[i]);
+ return resultSEXP;
+ }
+
+ SEXP create(SEXPTYPE type, std::size_t n)
+ {
+ ++n_;
+ return PROTECT(Rf_allocVector(type, n));
+ }
+
+ ~RObjectFactory()
+ {
+ UNPROTECT(n_);
+ }
+
+private:
+ std::size_t n_;
+};
+
+class ListBuilder : noncopyable
+{
+public:
+
+ void add(const std::string& name, SEXP value)
+ {
+ names_.push_back(name);
+ data_.push_back(protect_(value));
+ }
+
+ operator SEXP() const
+ {
+ std::size_t n = data_.size();
+
+ SEXP resultSEXP = protect_(Rf_allocVector(VECSXP, n));
+ SEXP namesSEXP = protect_(Rf_allocVector(STRSXP, n));
+
+ for (std::size_t i = 0; i < n; ++i)
+ {
+ SET_VECTOR_ELT(resultSEXP, i, data_[i]);
+ SET_STRING_ELT(namesSEXP, i, Rf_mkCharLen(names_[i].c_str(), names_[i].size()));
+ }
+
+ Rf_setAttrib(resultSEXP, R_NamesSymbol, namesSEXP);
+ return resultSEXP;
+ }
+
+private:
+ std::vector<std::string> names_;
+ std::vector<SEXP> data_;
+ mutable Protect protect_;
+};
+
+} // namespace r
+} // namespace sourcetools
+
+#endif /* SOURCETOOLS_R_R_UTILS_H */
diff --git a/inst/include/sourcetools/r/r.h b/inst/include/sourcetools/r/r.h
new file mode 100644
index 0000000..1076272
--- /dev/null
+++ b/inst/include/sourcetools/r/r.h
@@ -0,0 +1,11 @@
+#ifndef SOURCETOOLS_R_R_H
+#define SOURCETOOLS_R_R_H
+
+#include <sourcetools/r/RHeaders.h>
+#include <sourcetools/r/RUtils.h>
+#include <sourcetools/r/RConverter.h>
+#include <sourcetools/r/RFunctions.h>
+#include <sourcetools/r/RCallRecurser.h>
+#include <sourcetools/r/RNonStandardEvaluation.h>
+
+#endif /* SOURCETOOLS_R_R_H */
diff --git a/inst/include/sourcetools/read/MemoryMappedReader.h b/inst/include/sourcetools/read/MemoryMappedReader.h
new file mode 100644
index 0000000..a541379
--- /dev/null
+++ b/inst/include/sourcetools/read/MemoryMappedReader.h
@@ -0,0 +1,139 @@
+#ifndef SOURCETOOLS_READ_MEMORY_MAPPED_READER_H
+#define SOURCETOOLS_READ_MEMORY_MAPPED_READER_H
+
+#include <vector>
+#include <string>
+#include <algorithm>
+
+#include <sourcetools/core/macros.h>
+
+#include <sourcetools/r/RHeaders.h>
+#include <sourcetools/r/RUtils.h>
+
+#ifndef _WIN32
+# include <sourcetools/read/posix/FileConnection.h>
+# include <sourcetools/read/posix/MemoryMappedConnection.h>
+#else
+# include <sourcetools/read/windows/FileConnection.h>
+# include <sourcetools/read/windows/MemoryMappedConnection.h>
+#endif
+
+namespace sourcetools {
+namespace detail {
+
+class MemoryMappedReader
+{
+public:
+
+ class VectorReader
+ {
+ public:
+
+ explicit VectorReader(std::vector<std::string>* pData)
+ : pData_(pData)
+ {
+ }
+
+ template <typename T>
+ void operator()(const T& lhs, const T& rhs)
+ {
+ pData_->push_back(std::string(lhs, rhs));
+ }
+
+ private:
+ std::vector<std::string>* pData_;
+ };
+
+ static bool read(const char* path, std::string* pContent)
+ {
+ // Open file connection
+ FileConnection conn(path);
+ if (!conn.open())
+ return false;
+
+ // Get size of file
+ std::size_t size;
+ if (!conn.size(&size))
+ return false;
+
+ // Early return for empty files
+ if (UNLIKELY(size == 0))
+ return true;
+
+ // mmap the file
+ MemoryMappedConnection map(conn, size);
+ if (!map.open())
+ return false;
+
+ pContent->assign(map, size);
+ return true;
+ }
+
+ template <typename F>
+ static bool read_lines(const char* path, F f)
+ {
+ FileConnection conn(path);
+ if (!conn.open())
+ return false;
+
+ // Get size of file
+ std::size_t size;
+ if (!conn.size(&size))
+ return false;
+
+ // Early return for empty files
+ if (UNLIKELY(size == 0))
+ return true;
+
+ // mmap the file
+ MemoryMappedConnection map(conn, size);
+ if (!map.open())
+ return false;
+
+ // special case: just a '\n'
+ bool endsWithNewline = map[size - 1] == '\n';
+ if (size == 1 && endsWithNewline)
+ return true;
+
+ // Search for newlines
+ const char* lower = map;
+ const char* upper = map;
+ const char* end = map + size;
+ while (true)
+ {
+ upper = std::find(lower, end, '\n');
+ if (upper == end)
+ break;
+
+ // Handle '\r\n'
+ int CR = *(upper - 1) == '\r';
+ upper -= CR;
+
+ // Pass to functor
+ f(lower, upper);
+
+ // Update
+ lower = upper + 1 + CR;
+ }
+
+ // If this file ended with a newline, we're done
+ if (endsWithNewline)
+ return true;
+
+ // Otherwise, consume one more string, then we're done
+ f(lower, end);
+ return true;
+ }
+
+ static bool read_lines(const char* path, std::vector<std::string>* pContent)
+ {
+ VectorReader reader(pContent);
+ return read_lines(path, reader);
+ }
+
+};
+
+} // namespace detail
+} // namespace sourcetools
+
+#endif /* SOURCETOOLS_READ_MEMORY_MAPPED_READER_H */
diff --git a/inst/include/sourcetools/read/posix/FileConnection.h b/inst/include/sourcetools/read/posix/FileConnection.h
new file mode 100644
index 0000000..eaf5072
--- /dev/null
+++ b/inst/include/sourcetools/read/posix/FileConnection.h
@@ -0,0 +1,58 @@
+#ifndef SOURCETOOLS_READ_POSIX_FILE_CONNECTION_H
+#define SOURCETOOLS_READ_POSIX_FILE_CONNECTION_H
+
+#include <cstddef>
+
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+
+namespace sourcetools {
+namespace detail {
+
// Thin RAII wrapper around a POSIX file descriptor.
class FileConnection
{
public:

  typedef int FileDescriptor;

  // Open 'path' with the given flags (read-only by default).
  FileConnection(const char* path, int flags = O_RDONLY)
    : fd_(::open(path, flags))
  {
  }

  ~FileConnection()
  {
    if (fd_ != -1)
      ::close(fd_);
  }

  // True when the descriptor was opened successfully.
  bool open()
  {
    return fd_ != -1;
  }

  // Write the file size (per fstat) into '*pSize'; false on failure.
  bool size(std::size_t* pSize)
  {
    struct stat info;
    if (::fstat(fd_, &info) != 0)
      return false;

    *pSize = info.st_size;
    return true;
  }

  operator FileDescriptor() const
  {
    return fd_;
  }

private:
  FileDescriptor fd_;
};
+
+
+} // namespace detail
+} // namespace sourcetools
+
+#endif /* SOURCETOOLS_READ_POSIX_FILE_CONNECTION_H */
diff --git a/inst/include/sourcetools/read/posix/MemoryMappedConnection.h b/inst/include/sourcetools/read/posix/MemoryMappedConnection.h
new file mode 100644
index 0000000..5782ce2
--- /dev/null
+++ b/inst/include/sourcetools/read/posix/MemoryMappedConnection.h
@@ -0,0 +1,55 @@
+#ifndef SOURCETOOLS_READ_POSIX_MEMORY_MAPPED_CONNECTION_H
+#define SOURCETOOLS_READ_POSIX_MEMORY_MAPPED_CONNECTION_H
+
+#include <cstdlib>
+#include <fcntl.h>
+#include <sys/mman.h>
+
+#include <sourcetools/platform/platform.h>
+
+namespace sourcetools {
+namespace detail {
+
// RAII wrapper around a read-only mmap of an open file descriptor.
class MemoryMappedConnection
{
public:

  MemoryMappedConnection(int fd, std::size_t size)
    : size_(size)
  {
    // MAP_POPULATE (Linux) pre-faults the pages up front, avoiding
    // soft page faults during the sequential scan that follows.
#ifdef MAP_POPULATE
    map_ = (char*) ::mmap(0, size, PROT_READ, MAP_SHARED | MAP_POPULATE, fd, 0);
#else
    map_ = (char*) ::mmap(0, size, PROT_READ, MAP_SHARED, fd, 0);
#endif

    // Only advise the kernel when the mapping actually succeeded;
    // the original passed MAP_FAILED (an invalid address) on failure.
#if defined(POSIX_MADV_SEQUENTIAL) && defined(POSIX_MADV_WILLNEED)
    if (map_ != MAP_FAILED)
      ::posix_madvise((void*) map_, size, POSIX_MADV_SEQUENTIAL | POSIX_MADV_WILLNEED);
#endif
  }

  ~MemoryMappedConnection()
  {
    if (map_ != MAP_FAILED)
      ::munmap(map_, size_);
  }

  // True when the mapping succeeded.
  bool open()
  {
    return map_ != MAP_FAILED;
  }

  operator char*() const
  {
    return map_;
  }

private:
  char* map_;
  std::size_t size_;
};
+
+} // namespace detail
+} // namespace sourcetools
+
+#endif /* SOURCETOOLS_READ_POSIX_MEMORY_MAPPED_CONNECTION_H */
diff --git a/inst/include/sourcetools/read/read.h b/inst/include/sourcetools/read/read.h
new file mode 100644
index 0000000..e5074f8
--- /dev/null
+++ b/inst/include/sourcetools/read/read.h
@@ -0,0 +1,24 @@
+#ifndef SOURCETOOLS_READ_READ_H
+#define SOURCETOOLS_READ_READ_H
+
+#include <vector>
+#include <string>
+
+#include <sourcetools/read/MemoryMappedReader.h>
+
+namespace sourcetools {
+
+inline bool read(const std::string& absolutePath, std::string* pContent)
+{
+ return detail::MemoryMappedReader::read(absolutePath.c_str(), pContent);
+}
+
+inline bool read_lines(const std::string& absolutePath,
+ std::vector<std::string>* pLines)
+{
+ return detail::MemoryMappedReader::read_lines(absolutePath.c_str(), pLines);
+}
+
+} // namespace sourcetools
+
+#endif /* SOURCETOOLS_READ_READ_H */
diff --git a/inst/include/sourcetools/read/windows/FileConnection.h b/inst/include/sourcetools/read/windows/FileConnection.h
new file mode 100644
index 0000000..de3c346
--- /dev/null
+++ b/inst/include/sourcetools/read/windows/FileConnection.h
@@ -0,0 +1,50 @@
+#ifndef SOURCETOOLS_READ_WINDOWS_FILE_CONNECTION_H
+#define SOURCETOOLS_READ_WINDOWS_FILE_CONNECTION_H
+
+#undef Realloc
+#undef Free
+#include <windows.h>
+
+namespace sourcetools {
+namespace detail {
+
+class FileConnection
+{
+public:
+ typedef HANDLE FileDescriptor;
+
+ FileConnection(const char* path, int flags = GENERIC_READ)
+ {
+ handle_ = ::CreateFile(path, flags, FILE_SHARE_READ, NULL, OPEN_EXISTING, 0, NULL);
+ }
+
+ ~FileConnection()
+ {
+ if (open())
+ ::CloseHandle(handle_);
+ }
+
+ bool open()
+ {
+ return handle_ != INVALID_HANDLE_VALUE;
+ }
+
+ bool size(std::size_t* pSize)
+ {
+ *pSize = ::GetFileSize(handle_, NULL);
+ return true;
+ }
+
+ operator FileDescriptor() const
+ {
+ return handle_;
+ }
+
+private:
+ FileDescriptor handle_;
+};
+
+} // namespace detail
+} // namespace sourcetools
+
+#endif /* SOURCETOOLS_READ_WINDOWS_FILE_CONNECTION_H */
diff --git a/inst/include/sourcetools/read/windows/MemoryMappedConnection.h b/inst/include/sourcetools/read/windows/MemoryMappedConnection.h
new file mode 100644
index 0000000..0885e3b
--- /dev/null
+++ b/inst/include/sourcetools/read/windows/MemoryMappedConnection.h
@@ -0,0 +1,51 @@
+#ifndef SOURCETOOLS_READ_WINDOWS_MEMORY_MAPPED_CONNECTION_H
+#define SOURCETOOLS_READ_WINDOWS_MEMORY_MAPPED_CONNECTION_H
+
+#undef Realloc
+#undef Free
+#include <windows.h>
+
+namespace sourcetools {
+namespace detail {
+
+class MemoryMappedConnection
+{
+public:
+
+ MemoryMappedConnection(HANDLE handle, std::size_t size)
+ : map_(NULL), size_(size)
+ {
+ handle_ = ::CreateFileMapping(handle, NULL, PAGE_READONLY, 0, 0, NULL);
+ if (handle_ == NULL)
+ return;
+
+ map_ = (char*) ::MapViewOfFile(handle_, FILE_MAP_READ, 0, 0, size);
+ }
+
+ ~MemoryMappedConnection()
+ {
+ if (handle_ != INVALID_HANDLE_VALUE)
+ ::CloseHandle(handle_);
+ }
+
+ bool open()
+ {
+ return map_ != NULL;
+ }
+
+ operator char*() const
+ {
+ return map_;
+ }
+
+private:
+ char* map_;
+ std::size_t size_;
+
+ HANDLE handle_;
+};
+
+} // namespace detail
+} // namespace sourcetools
+
+#endif /* SOURCETOOLS_READ_WINDOWS_MEMORY_MAPPED_CONNECTION_H */
diff --git a/inst/include/sourcetools/tests/testthat.h b/inst/include/sourcetools/tests/testthat.h
new file mode 100644
index 0000000..7ed58f5
--- /dev/null
+++ b/inst/include/sourcetools/tests/testthat.h
@@ -0,0 +1,14 @@
+#ifndef SOURCETOOLS_TESTS_TESTTHAT_H
+#define SOURCETOOLS_TESTS_TESTTHAT_H
+
+// disable testthat with older gcc
+#if defined(__GNUC__) && defined(__GNUC_MINOR__) && !defined(__clang__)
+# if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 6)
+# define TESTTHAT_DISABLED
+# endif
+#endif
+
+// include testthat.h
+#include <testthat.h>
+
+#endif /* SOURCETOOLS_TESTS_TESTTHAT_H */
diff --git a/inst/include/sourcetools/tokenization/Registration.h b/inst/include/sourcetools/tokenization/Registration.h
new file mode 100644
index 0000000..9a44fee
--- /dev/null
+++ b/inst/include/sourcetools/tokenization/Registration.h
@@ -0,0 +1,190 @@
+#ifndef SOURCETOOLS_TOKENIZATION_REGISTRATION_H
+#define SOURCETOOLS_TOKENIZATION_REGISTRATION_H
+
+#include <string>
+#include <cstring>
+#include <cstdlib>
+
+namespace sourcetools {
+namespace tokens {
+
typedef unsigned int TokenType;

// Simple, non-nestable types. Each simple type occupies its own high
// bit so that token categories can be tested and combined as bit masks.
#define SOURCE_TOOLS_REGISTER_SIMPLE_TYPE(__NAME__, __TYPE__) \
  static const TokenType __NAME__ = __TYPE__

// NOTE: unsigned literals are required here -- '1 << 31' shifts into
// the sign bit of a (signed) int, which is undefined behavior.
SOURCE_TOOLS_REGISTER_SIMPLE_TYPE(INVALID,    (1u << 31));
SOURCE_TOOLS_REGISTER_SIMPLE_TYPE(END,        (1u << 30));
SOURCE_TOOLS_REGISTER_SIMPLE_TYPE(EMPTY,      (1u << 29));
SOURCE_TOOLS_REGISTER_SIMPLE_TYPE(MISSING,    (1u << 28));
SOURCE_TOOLS_REGISTER_SIMPLE_TYPE(ROOT,       (1u << 27));
SOURCE_TOOLS_REGISTER_SIMPLE_TYPE(SEMI,       (1u << 26));
SOURCE_TOOLS_REGISTER_SIMPLE_TYPE(COMMA,      (1u << 25));
SOURCE_TOOLS_REGISTER_SIMPLE_TYPE(SYMBOL,     (1u << 24));
SOURCE_TOOLS_REGISTER_SIMPLE_TYPE(COMMENT,    (1u << 23));
SOURCE_TOOLS_REGISTER_SIMPLE_TYPE(WHITESPACE, (1u << 22));
SOURCE_TOOLS_REGISTER_SIMPLE_TYPE(STRING,     (1u << 21));
SOURCE_TOOLS_REGISTER_SIMPLE_TYPE(NUMBER,     (1u << 20));
+
/* Brackets */
// Layout: bit 19 marks any bracket; bits 4/5 mark the left/right side;
// the low four bits index the bracket pair, shared between a bracket
// and its complement (so flipping bits 4/5 swaps sides).
#define SOURCE_TOOLS_BRACKET_BIT (1 << 19)
#define SOURCE_TOOLS_BRACKET_RIGHT_BIT (1 << 5)
#define SOURCE_TOOLS_BRACKET_LEFT_BIT (1 << 4)
#define SOURCE_TOOLS_BRACKET_MASK SOURCE_TOOLS_BRACKET_BIT
#define SOURCE_TOOLS_BRACKET_LEFT_MASK (SOURCE_TOOLS_BRACKET_BIT | SOURCE_TOOLS_BRACKET_LEFT_BIT)
#define SOURCE_TOOLS_BRACKET_RIGHT_MASK (SOURCE_TOOLS_BRACKET_BIT | SOURCE_TOOLS_BRACKET_RIGHT_BIT)

#define SOURCE_TOOLS_REGISTER_BRACKET(__NAME__, __SIDE__, __INDEX__) \
  static const TokenType __NAME__ = \
    SOURCE_TOOLS_BRACKET_BIT | __SIDE__ | __INDEX__

SOURCE_TOOLS_REGISTER_BRACKET(LPAREN, SOURCE_TOOLS_BRACKET_LEFT_BIT, (1 << 0));
SOURCE_TOOLS_REGISTER_BRACKET(LBRACE, SOURCE_TOOLS_BRACKET_LEFT_BIT, (1 << 1));
SOURCE_TOOLS_REGISTER_BRACKET(LBRACKET, SOURCE_TOOLS_BRACKET_LEFT_BIT, (1 << 2));
SOURCE_TOOLS_REGISTER_BRACKET(LDBRACKET, SOURCE_TOOLS_BRACKET_LEFT_BIT, (1 << 3));

SOURCE_TOOLS_REGISTER_BRACKET(RPAREN, SOURCE_TOOLS_BRACKET_RIGHT_BIT, (1 << 0));
SOURCE_TOOLS_REGISTER_BRACKET(RBRACE, SOURCE_TOOLS_BRACKET_RIGHT_BIT, (1 << 1));
SOURCE_TOOLS_REGISTER_BRACKET(RBRACKET, SOURCE_TOOLS_BRACKET_RIGHT_BIT, (1 << 2));
SOURCE_TOOLS_REGISTER_BRACKET(RDBRACKET, SOURCE_TOOLS_BRACKET_RIGHT_BIT, (1 << 3));

/* Operators */
// Layout: bit 18 marks any operator; bit 6 marks operators that can
// also occur in a unary (prefix) context. The low bits are a unique
// index per operator. Each registration also produces an
// OPERATOR_<NAME>_STRING constant holding the operator's spelling.
#define SOURCE_TOOLS_OPERATOR_BIT (1 << 18)
#define SOURCE_TOOLS_OPERATOR_UNARY_BIT (1 << 6)
#define SOURCE_TOOLS_OPERATOR_MASK (SOURCE_TOOLS_OPERATOR_BIT)
#define SOURCE_TOOLS_OPERATOR_UNARY_MASK (SOURCE_TOOLS_OPERATOR_MASK | SOURCE_TOOLS_OPERATOR_UNARY_BIT)

#define SOURCE_TOOLS_REGISTER_OPERATOR(__NAME__, __STRING__, __MASKS__) \
  \
  static const TokenType OPERATOR_ ## __NAME__ = \
    SOURCE_TOOLS_OPERATOR_BIT | __MASKS__; \
  \
  static const char* const \
    OPERATOR_ ## __NAME__ ## _STRING = __STRING__

#define SOURCE_TOOLS_REGISTER_UNARY_OPERATOR(__NAME__, __STRING__, __INDEX__) \
  SOURCE_TOOLS_REGISTER_OPERATOR(__NAME__, __STRING__, SOURCE_TOOLS_OPERATOR_UNARY_BIT | __INDEX__)

// See ?"Syntax" for details on R's operators.
// Note: All operators registered work in a binary context, but only
// some will also work as unary operators, occurring to the left of
// their operand.
//
// In other words, -1 is parsed as `-`(1).
//
// Note that although brackets are operators we tokenize them separately,
// since we need to later check for their paired complement.
SOURCE_TOOLS_REGISTER_UNARY_OPERATOR(PLUS, "+", 0);
SOURCE_TOOLS_REGISTER_UNARY_OPERATOR(MINUS, "-", 1);
SOURCE_TOOLS_REGISTER_UNARY_OPERATOR(HELP, "?", 2);
SOURCE_TOOLS_REGISTER_UNARY_OPERATOR(NEGATION, "!", 3);
SOURCE_TOOLS_REGISTER_UNARY_OPERATOR(FORMULA, "~", 4);

SOURCE_TOOLS_REGISTER_OPERATOR(NAMESPACE_EXPORTS, "::", 5);
SOURCE_TOOLS_REGISTER_OPERATOR(NAMESPACE_ALL, ":::", 6);
SOURCE_TOOLS_REGISTER_OPERATOR(DOLLAR, "$", 7);
SOURCE_TOOLS_REGISTER_OPERATOR(AT, "@", 8);
SOURCE_TOOLS_REGISTER_OPERATOR(HAT, "^", 9);
SOURCE_TOOLS_REGISTER_OPERATOR(EXPONENTATION_STARS, "**", 10);
SOURCE_TOOLS_REGISTER_OPERATOR(SEQUENCE, ":", 11);
SOURCE_TOOLS_REGISTER_OPERATOR(MULTIPLY, "*", 12);
SOURCE_TOOLS_REGISTER_OPERATOR(DIVIDE, "/", 13);
SOURCE_TOOLS_REGISTER_OPERATOR(LESS, "<", 14);
SOURCE_TOOLS_REGISTER_OPERATOR(LESS_OR_EQUAL, "<=", 15);
SOURCE_TOOLS_REGISTER_OPERATOR(GREATER, ">", 16);
SOURCE_TOOLS_REGISTER_OPERATOR(GREATER_OR_EQUAL, ">=", 17);
SOURCE_TOOLS_REGISTER_OPERATOR(EQUAL, "==", 18);
SOURCE_TOOLS_REGISTER_OPERATOR(NOT_EQUAL, "!=", 19);
SOURCE_TOOLS_REGISTER_OPERATOR(AND_VECTOR, "&", 20);
SOURCE_TOOLS_REGISTER_OPERATOR(AND_SCALAR, "&&", 21);
SOURCE_TOOLS_REGISTER_OPERATOR(OR_VECTOR, "|", 22);
SOURCE_TOOLS_REGISTER_OPERATOR(OR_SCALAR, "||", 23);
SOURCE_TOOLS_REGISTER_OPERATOR(ASSIGN_LEFT, "<-", 24);
SOURCE_TOOLS_REGISTER_OPERATOR(ASSIGN_LEFT_PARENT, "<<-", 25);
SOURCE_TOOLS_REGISTER_OPERATOR(ASSIGN_RIGHT, "->", 26);
SOURCE_TOOLS_REGISTER_OPERATOR(ASSIGN_RIGHT_PARENT, "->>", 27);
SOURCE_TOOLS_REGISTER_OPERATOR(ASSIGN_LEFT_EQUALS, "=", 28);
SOURCE_TOOLS_REGISTER_OPERATOR(ASSIGN_LEFT_COLON, ":=", 29);
// Representative spelling for user-defined ('%...%') operators; the
// tokenizer matches any text delimited by '%'.
SOURCE_TOOLS_REGISTER_OPERATOR(USER, "%%", 30);

/* Keywords and symbols */
// Layout: bit 17 marks any keyword; bit 7 marks control-flow keywords
// (if / for / while / repeat / function). Low bits index the keyword.
#define SOURCE_TOOLS_KEYWORD_BIT (1 << 17)
#define SOURCE_TOOLS_KEYWORD_CONTROL_FLOW_BIT (1 << 7)
#define SOURCE_TOOLS_KEYWORD_MASK SOURCE_TOOLS_KEYWORD_BIT
#define SOURCE_TOOLS_KEYWORD_CONTROL_FLOW_MASK (SOURCE_TOOLS_KEYWORD_MASK | SOURCE_TOOLS_KEYWORD_CONTROL_FLOW_BIT)

#define SOURCE_TOOLS_REGISTER_KEYWORD(__NAME__, __MASKS__) \
  static const TokenType KEYWORD_ ## __NAME__ = \
    __MASKS__ | SOURCE_TOOLS_KEYWORD_MASK

#define SOURCE_TOOLS_REGISTER_CONTROL_FLOW_KEYWORD(__NAME__, __MASKS__) \
  SOURCE_TOOLS_REGISTER_KEYWORD(__NAME__, __MASKS__ | SOURCE_TOOLS_KEYWORD_CONTROL_FLOW_MASK)

// See '?Reserved' for a list of reserved R symbols.
SOURCE_TOOLS_REGISTER_CONTROL_FLOW_KEYWORD(IF, 1);
SOURCE_TOOLS_REGISTER_CONTROL_FLOW_KEYWORD(FOR, 2);
SOURCE_TOOLS_REGISTER_CONTROL_FLOW_KEYWORD(WHILE, 3);
SOURCE_TOOLS_REGISTER_CONTROL_FLOW_KEYWORD(REPEAT, 4);
SOURCE_TOOLS_REGISTER_CONTROL_FLOW_KEYWORD(FUNCTION, 5);

SOURCE_TOOLS_REGISTER_KEYWORD(ELSE, 6);
SOURCE_TOOLS_REGISTER_KEYWORD(IN, 7);
SOURCE_TOOLS_REGISTER_KEYWORD(NEXT, 8);
SOURCE_TOOLS_REGISTER_KEYWORD(BREAK, 9);
SOURCE_TOOLS_REGISTER_KEYWORD(TRUE, 10);
SOURCE_TOOLS_REGISTER_KEYWORD(FALSE, 11);
SOURCE_TOOLS_REGISTER_KEYWORD(NULL, 12);
SOURCE_TOOLS_REGISTER_KEYWORD(Inf, 13);
SOURCE_TOOLS_REGISTER_KEYWORD(NaN, 14);
SOURCE_TOOLS_REGISTER_KEYWORD(NA, 15);
SOURCE_TOOLS_REGISTER_KEYWORD(NA_integer_, 16);
SOURCE_TOOLS_REGISTER_KEYWORD(NA_real_, 17);
SOURCE_TOOLS_REGISTER_KEYWORD(NA_complex_, 18);
SOURCE_TOOLS_REGISTER_KEYWORD(NA_character_, 19);
+
+inline TokenType symbolType(const char* string, std::size_t n)
+{
+ // TODO: Is this insanity really an optimization or am I just silly?
+ if (n < 2 || n > 13) {
+ return SYMBOL;
+ } else if (n == 2) {
+ if (!std::memcmp(string, "in", n)) return KEYWORD_IN;
+ if (!std::memcmp(string, "if", n)) return KEYWORD_IF;
+ if (!std::memcmp(string, "NA", n)) return KEYWORD_NA;
+ } else if (n == 3) {
+ if (!std::memcmp(string, "for", n)) return KEYWORD_FOR;
+ if (!std::memcmp(string, "Inf", n)) return KEYWORD_Inf;
+ if (!std::memcmp(string, "NaN", n)) return KEYWORD_NaN;
+ } else if (n == 4) {
+ if (!std::memcmp(string, "else", n)) return KEYWORD_ELSE;
+ if (!std::memcmp(string, "next", n)) return KEYWORD_NEXT;
+ if (!std::memcmp(string, "TRUE", n)) return KEYWORD_TRUE;
+ if (!std::memcmp(string, "NULL", n)) return KEYWORD_NULL;
+ } else if (n == 5) {
+ if (!std::memcmp(string, "while", n)) return KEYWORD_WHILE;
+ if (!std::memcmp(string, "break", n)) return KEYWORD_BREAK;
+ if (!std::memcmp(string, "FALSE", n)) return KEYWORD_FALSE;
+ } else if (n == 6) {
+ if (!std::memcmp(string, "repeat", n)) return KEYWORD_REPEAT;
+ } else if (n == 8) {
+ if (!std::memcmp(string, "function", n)) return KEYWORD_FUNCTION;
+ if (!std::memcmp(string, "NA_real_", n)) return KEYWORD_NA_real_;
+ } else if (n == 11) {
+ if (!std::memcmp(string, "NA_integer_", n)) return KEYWORD_NA_integer_;
+ if (!std::memcmp(string, "NA_complex_", n)) return KEYWORD_NA_complex_;
+ } else if (n == 13) {
+ if (!std::memcmp(string, "NA_character_", n)) return KEYWORD_NA_character_;
+ }
+
+ return SYMBOL;
+}
+
+inline TokenType symbolType(const std::string& symbol)
+{
+ return symbolType(symbol.data(), symbol.size());
+}
+
+} // namespace tokens
+} // namespace sourcetools
+
+#endif /* SOURCETOOLS_TOKENIZATION_REGISTRATION_H */
diff --git a/inst/include/sourcetools/tokenization/Token.h b/inst/include/sourcetools/tokenization/Token.h
new file mode 100644
index 0000000..fff05f3
--- /dev/null
+++ b/inst/include/sourcetools/tokenization/Token.h
@@ -0,0 +1,522 @@
+#ifndef SOURCETOOLS_TOKENIZATION_TOKEN_H
+#define SOURCETOOLS_TOKENIZATION_TOKEN_H
+
#include <cstdio>
#include <cstring>
#include <cwchar> // std::mbstate_t, std::wcrtomb (used by detail::parseUnicode)

#include <map>
#include <ostream>
#include <sstream>
#include <string>
#include <vector>
+
+#include <sourcetools/core/core.h>
+#include <sourcetools/tokenization/Registration.h>
+#include <sourcetools/collection/Position.h>
+#include <sourcetools/cursor/TextCursor.h>
+
+namespace sourcetools {
+namespace tokens {
+
+class Token
+{
+private:
+ typedef cursors::TextCursor TextCursor;
+ typedef collections::Position Position;
+
+public:
+
+ Token()
+ : begin_(NULL),
+ end_(NULL),
+ offset_(0),
+ type_(INVALID)
+ {
+ }
+
+ explicit Token(TokenType type)
+ : begin_(NULL),
+ end_(NULL),
+ offset_(0),
+ type_(type)
+ {
+ }
+
+ Token(const Position& position)
+ : begin_(NULL),
+ end_(NULL),
+ offset_(0),
+ position_(position),
+ type_(INVALID)
+ {
+ }
+
+ Token(const TextCursor& cursor, TokenType type, std::size_t length)
+ : begin_(cursor.begin() + cursor.offset()),
+ end_(cursor.begin() + cursor.offset() + length),
+ offset_(cursor.offset()),
+ position_(cursor.position()),
+ type_(type)
+ {
+ }
+
+ const char* begin() const { return begin_; }
+ const char* end() const { return end_; }
+ std::size_t offset() const { return offset_; }
+ std::size_t size() const { return end_ - begin_; }
+
+ std::string contents() const
+ {
+ return std::string(begin_, end_);
+ }
+
+ bool contentsEqual(const char* string)
+ {
+ return std::strcmp(begin_, string);
+ }
+
+ bool contentsEqual(const std::string& string) const
+ {
+ if (string.size() != size())
+ return false;
+
+ return std::memcmp(begin_, string.c_str(), size()) == 0;
+ }
+
+ const Position& position() const { return position_; }
+ std::size_t row() const { return position_.row; }
+ std::size_t column() const { return position_.column; }
+
+ TokenType type() const { return type_; }
+ bool isType(TokenType type) const { return type_ == type; }
+
+private:
+ const char* begin_;
+ const char* end_;
+ std::size_t offset_;
+
+ Position position_;
+ TokenType type_;
+};
+
// Check whether 'token' is any bracket ('(', '[', '[[', '{' or a
// right-hand complement).
inline bool isBracket(const Token& token)
{
  return SOURCE_TOOLS_CHECK_MASK(token.type(), SOURCE_TOOLS_BRACKET_MASK);
}

// Check whether 'token' is an opening bracket.
inline bool isLeftBracket(const Token& token)
{
  return SOURCE_TOOLS_CHECK_MASK(token.type(), SOURCE_TOOLS_BRACKET_LEFT_MASK);
}

// Check whether 'token' is a closing bracket.
inline bool isRightBracket(const Token& token)
{
  return SOURCE_TOOLS_CHECK_MASK(token.type(), SOURCE_TOOLS_BRACKET_RIGHT_MASK);
}

// Check whether 'lhs' and 'rhs' form a matched bracket pair (e.g.
// LPAREN / RPAREN): both must carry bracket bits, and paired brackets
// share the same low-bit index.
inline bool isComplement(TokenType lhs, TokenType rhs)
{
  static const TokenType mask =
    SOURCE_TOOLS_BRACKET_BIT | SOURCE_TOOLS_BRACKET_LEFT_BIT | SOURCE_TOOLS_BRACKET_RIGHT_BIT;

  if (SOURCE_TOOLS_CHECK_MASK((lhs | rhs), mask))
    return SOURCE_TOOLS_LOWER_BITS(lhs, 4) == SOURCE_TOOLS_LOWER_BITS(rhs, 4);

  return false;
}

// Get the complementing bracket type (e.g. RPAREN for LPAREN): XOR-ing
// both side bits flips left <-> right while preserving the pair index.
inline TokenType complement(TokenType type)
{
  static const TokenType mask =
    SOURCE_TOOLS_BRACKET_LEFT_BIT | SOURCE_TOOLS_BRACKET_RIGHT_BIT;

  return type ^ mask;
}
+
// Check whether 'token' is any reserved keyword.
inline bool isKeyword(const Token& token)
{
  return SOURCE_TOOLS_CHECK_MASK(token.type(), SOURCE_TOOLS_KEYWORD_MASK);
}

// Check whether 'token' is a control-flow keyword (if / for / while /
// repeat / function).
inline bool isControlFlowKeyword(const Token& token)
{
  return SOURCE_TOOLS_CHECK_MASK(token.type(), SOURCE_TOOLS_KEYWORD_CONTROL_FLOW_MASK);
}

// Check whether 'token' is any operator.
inline bool isOperator(const Token& token)
{
  return SOURCE_TOOLS_CHECK_MASK(token.type(), SOURCE_TOOLS_OPERATOR_MASK);
}

// Check whether 'token' is an operator that can occur in a unary
// (prefix) context, e.g. '-', '+', '!', '~', '?'.
inline bool isUnaryOperator(const Token& token)
{
  return SOURCE_TOOLS_CHECK_MASK(token.type(), SOURCE_TOOLS_OPERATOR_UNARY_MASK);
}

// Check whether 'token' is an operator usable only in binary contexts.
inline bool isNonUnaryOperator(const Token& token)
{
  return isOperator(token) && !isUnaryOperator(token);
}
+
+inline bool isComparisonOperator(const Token& token)
+{
+ switch (token.type())
+ {
+ case OPERATOR_AND_SCALAR:
+ case OPERATOR_AND_VECTOR:
+ case OPERATOR_OR_SCALAR:
+ case OPERATOR_OR_VECTOR:
+ case OPERATOR_EQUAL:
+ case OPERATOR_NOT_EQUAL:
+ case OPERATOR_LESS:
+ case OPERATOR_LESS_OR_EQUAL:
+ case OPERATOR_GREATER:
+ case OPERATOR_GREATER_OR_EQUAL:
+ return true;
+ default:
+ return false;
+ }
+}
+
inline bool isWhitespace(const Token& token)
{
  return token.type() == WHITESPACE;
}

inline bool isComment(const Token& token)
{
  return token.type() == COMMENT;
}

inline bool isSymbol(const Token& token)
{
  return token.type() == SYMBOL;
}

inline bool isEnd(const Token& token)
{
  return token.type() == END;
}

inline bool isString(const Token& token)
{
  return token.type() == STRING;
}

// Check whether 'token' can act as a value in an expression: a symbol,
// a number, or a string. The bitwise test works because SYMBOL, NUMBER
// and STRING each occupy a distinct single bit.
inline bool isSymbolic(const Token& token)
{
  static const TokenType mask = SYMBOL | NUMBER | STRING;
  return (token.type() & mask) != 0;
}

inline bool isNumeric(const Token& token)
{
  return (token.type() & NUMBER) != 0;
}

// Check whether 'token' opens a call-like form: '(' for function
// calls, '[' / '[[' for indexing.
inline bool isCallOperator(const Token& token)
{
  return token.type() == LPAREN ||
         token.type() == LBRACKET ||
         token.type() == LDBRACKET;
}

// Check whether 'token' is one of R's assignment operators
// ('<-', '<<-', '->', '->>', '=', ':=').
inline bool isAssignmentOperator(const Token& token)
{
  switch (token.type())
  {
  case OPERATOR_ASSIGN_LEFT:
  case OPERATOR_ASSIGN_LEFT_COLON:
  case OPERATOR_ASSIGN_LEFT_EQUALS:
  case OPERATOR_ASSIGN_LEFT_PARENT:
  case OPERATOR_ASSIGN_RIGHT:
  case OPERATOR_ASSIGN_RIGHT_PARENT:
    return true;
  default:
    return false;
  }
}
+
+namespace detail {
+
// Check whether 'c' is a hexadecimal digit: '0'-'9', 'a'-'f', 'A'-'F'.
inline bool isHexDigit(char c)
{
  return (c >= '0' && c <= '9') ||
         (c >= 'a' && c <= 'f') ||
         (c >= 'A' && c <= 'F');
}
+
// Get the numeric value of a hexadecimal digit; non-hex characters
// map to 0.
inline int hexValue(char c)
{
  if ('0' <= c && c <= '9')
    return c - '0';

  if ('a' <= c && c <= 'f')
    return 10 + (c - 'a');

  if ('A' <= c && c <= 'F')
    return 10 + (c - 'A');

  return 0;
}
+
// Parse an octal escape sequence, e.g. '\012'. On success, writes the
// decoded byte to 'output', advances both iterators past the consumed
// text, and returns true; on failure, leaves both iterators untouched.
inline bool parseOctal(const char*& it, char*& output)
{
  // Require a backslash followed by at least one octal digit.
  if (it[0] != '\\')
    return false;

  char next = it[1];
  if (next < '0' || next > '7')
    return false;

  // Consume at most three octal digits after the backslash.
  ++it;
  unsigned char value = 0;
  for (int i = 0; i < 3; ++i)
  {
    char ch = *it;
    if (ch < '0' || ch > '7')
      break;

    value = value * 8 + (ch - '0');
    ++it;
  }

  *output++ = value;
  return true;
}
+
+// Parse a hex escape sequence, e.g. '\xFF'.
+inline bool parseHex(const char*& it, char*& output)
+{
+ // Check for opening escape.
+ if (*it != '\\')
+ return false;
+
+ if (*(it + 1) != 'x')
+ return false;
+
+ if (!isHexDigit(*(it + 2)))
+ return false;
+
+ // Begin parsing.
+ it += 2;
+ unsigned char value = 0;
+ const char* end = it + 2;
+ for (; it != end; ++it)
+ {
+ int result = hexValue(*it);
+ if (result == 0)
+ break;
+ value = 16 * value + result;
+ }
+
+ *output++ = value;
+ return true;
+}
+
// Parse a unicode escape sequence, e.g. '\u2603', '\u{FF}' or
// '\U0001F600'. On success, encodes the code point into 'output' via
// std::wcrtomb() (the current locale's multi-byte encoding), advances
// both iterators, and returns true; on failure, 'it' is not advanced.
inline bool parseUnicode(const char*& it, char*& output)
{
  if (*it != '\\')
    return false;

  // '\u' escapes carry up to 4 hex digits; '\U' escapes up to 8.
  char lookahead = *(it + 1);
  int size;
  if (lookahead == 'u')
    size = 4;
  else if (lookahead == 'U')
    size = 8;
  else
    return false;

  // Clone the input iterator (only set it on success)
  const char* clone = it;
  clone += 2;

  // Check for e.g. '\u{...}'
  //                   ^
  bool delimited = *clone == '{';
  clone += delimited;

  // Check for a hex digit.
  if (!isHexDigit(*clone))
    return false;

  // Begin parsing hex digits
  // NOTE(review): the value is accumulated in a wchar_t, which is only
  // 16 bits on Windows -- '\U' escapes beyond the BMP would overflow
  // there; confirm against upstream behavior.
  wchar_t value = 0;
  const char* end = clone + size;
  for (; clone != end; ++clone)
  {
    if (!isHexDigit(*clone))
      break;

    int hex = hexValue(*clone);
    value = 16 * value + hex;
  }

  // Eat a closing '}' if we had a starting '{'.
  if (delimited)
  {
    if (*clone != '}')
      return false;
    ++clone;
  }

  // Encode the decoded code point into the output buffer.
  std::mbstate_t state;
  std::memset(&state, 0, sizeof(state));
  std::size_t bytes = std::wcrtomb(output, value, &state);
  if (bytes == static_cast<std::size_t>(-1))
    return false;

  // Update iterator state
  it = clone;
  output += bytes;
  return true;
}
+
+} // namespace detail
+
+inline std::string stringValue(const char* begin, const char* end)
+{
+ if (begin == end)
+ return std::string();
+
+ std::size_t n = end - begin;
+ scoped_array<char> buffer(new char[n + 1]);
+
+ const char* it = begin;
+ char* output = buffer;
+
+ while (it < end)
+ {
+ if (*it == '\\')
+ {
+ if (detail::parseOctal(it, output) ||
+ detail::parseHex(it, output) ||
+ detail::parseUnicode(it, output))
+ {
+ continue;
+ }
+
+ // Handle the rest
+ ++it;
+ switch (*it)
+ {
+ case 'a': *output++ = '\a'; break;
+ case 'b': *output++ = '\b'; break;
+ case 'f': *output++ = '\f'; break;
+ case 'n': *output++ = '\n'; break;
+ case 'r': *output++ = '\r'; break;
+ case 't': *output++ = '\t'; break;
+ case 'v': *output++ = '\v'; break;
+ case '\\': *output++ = '\\'; break;
+ default: *output++ = *it; break;
+ }
+ ++it;
+ }
+ else
+ {
+ *output++ = *it++;
+ }
+ }
+
+ // Ensure null termination, just in case
+ *output++ = '\0';
+
+ // Construct the result string and return
+ std::string result(buffer, output - buffer);
+ return result;
+}
+
// Decode a token's string value. STRING tokens strip their surrounding
// quotes; backtick-quoted SYMBOL tokens strip the backticks; all other
// tokens decode their full contents.
inline std::string stringValue(const Token& token)
{
  switch (token.type())
  {
  case STRING:
    return stringValue(token.begin() + 1, token.end() - 1);
  case SYMBOL:
    // NOTE: intentional fallthrough -- only backtick-quoted symbols
    // strip their delimiters; plain symbols fall into the default case.
    if (*token.begin() == '`')
      return stringValue(token.begin() + 1, token.end() - 1);
  default:
    return stringValue(token.begin(), token.end());
  }
}
+
+} // namespace tokens
+
+inline std::string toString(tokens::TokenType type)
+{
+ using namespace tokens;
+
+ if (type == INVALID) return "invalid";
+ else if (type == END) return "end";
+ else if (type == EMPTY) return "empty";
+ else if (type == MISSING) return "missing";
+ else if (type == SEMI) return "semi";
+ else if (type == COMMA) return "comma";
+ else if (type == SYMBOL) return "symbol";
+ else if (type == COMMENT) return "comment";
+ else if (type == WHITESPACE) return "whitespace";
+ else if (type == STRING) return "string";
+ else if (type == NUMBER) return "number";
+
+ else if (SOURCE_TOOLS_CHECK_MASK(type, SOURCE_TOOLS_BRACKET_MASK))
+ return "bracket";
+ else if (SOURCE_TOOLS_CHECK_MASK(type, SOURCE_TOOLS_KEYWORD_MASK))
+ return "keyword";
+ else if (SOURCE_TOOLS_CHECK_MASK(type, SOURCE_TOOLS_OPERATOR_MASK))
+ return "operator";
+
+ return "unknown";
+}
+
+inline std::string toString(const tokens::Token& token)
+{
+ std::string contents;
+ if (token.isType(tokens::END))
+ contents = "<END>";
+ else if (token.isType(tokens::EMPTY))
+ contents = "<empty>";
+ else if (token.isType(tokens::MISSING))
+ contents = "<missing>";
+ else
+ contents = token.contents();
+
+ static const int N = 1024;
+ if (contents.size() > N / 2)
+ contents = contents.substr(0, N / 2);
+ char buff[N];
+ std::sprintf(buff,
+ "[%4lu:%4lu]: %s",
+ static_cast<unsigned long>(token.row()),
+ static_cast<unsigned long>(token.column()),
+ contents.c_str());
+ return buff;
+}
+
+inline std::ostream& operator<<(std::ostream& os, const tokens::Token& token)
+{
+ return os << toString(token);
+}
+
+inline std::ostream& operator<<(std::ostream& os, const std::vector<tokens::Token>& tokens)
+{
+ for (std::vector<tokens::Token>::const_iterator it = tokens.begin();
+ it != tokens.end();
+ ++it)
+ {
+ os << *it << std::endl;
+ }
+
+ return os;
+}
+
+} // namespace sourcetools
+
+#endif /* SOURCETOOLS_TOKENIZATION_TOKEN_H */
diff --git a/inst/include/sourcetools/tokenization/Tokenizer.h b/inst/include/sourcetools/tokenization/Tokenizer.h
new file mode 100644
index 0000000..3f601dd
--- /dev/null
+++ b/inst/include/sourcetools/tokenization/Tokenizer.h
@@ -0,0 +1,463 @@
+#ifndef SOURCETOOLS_TOKENIZATION_TOKENIZER_H
+#define SOURCETOOLS_TOKENIZATION_TOKENIZER_H
+
+#include <sourcetools/core/core.h>
+#include <sourcetools/tokenization/Token.h>
+#include <sourcetools/cursor/TextCursor.h>
+
+#include <vector>
+#include <stack>
+#include <sstream>
+
+namespace sourcetools {
+namespace tokenizer {
+
// Incremental tokenizer for R source code. Construct with a buffer and
// its length, then call tokenize() repeatedly: each call produces one
// token (including whitespace and comment tokens) and returns false
// once the end of input has been reached.
class Tokenizer
{
private:
  typedef tokens::Token Token;
  typedef cursors::TextCursor TextCursor;
  typedef tokens::TokenType TokenType;

private:

  // Tokenization ----

  // Emit a token of 'length' characters at the current cursor position
  // and advance the cursor past it.
  void consumeToken(TokenType type,
                    std::size_t length,
                    Token* pToken)
  {
    *pToken = Token(cursor_, type, length);
    cursor_.advance(length);
  }

  // Consume characters up to and including the next occurrence of 'ch'
  // (the terminator of a string / quoted symbol / comment / '%' operator).
  // When SkipEscaped is set, a backslash skips the following character,
  // so escaped terminators do not end the token. If the terminator is
  // never found: with InvalidOnError the consumed text becomes an
  // INVALID token; otherwise it keeps 'type' (e.g. a comment closed by
  // end-of-input is still a comment).
  template <bool SkipEscaped, bool InvalidOnError>
  void consumeUntil(char ch,
                    TokenType type,
                    Token* pToken)
  {
    TextCursor lookahead = cursor_;

    bool success = false;
    std::size_t distance = 0;

    while (lookahead != lookahead.end()) {
      lookahead.advance();
      ++distance;

      if (SkipEscaped && lookahead.peek() == '\\') {
        lookahead.advance();
        ++distance;
        continue;
      }

      if (lookahead.peek() == ch) {
        success = true;
        break;
      }
    }

    if (success) {
      // '+ 1' includes the closing terminator character itself.
      consumeToken(type, distance + 1, pToken);
    } else {
      consumeToken(
        InvalidOnError ? tokens::INVALID : type,
        distance,
        pToken
      );
    }
  }

  // Consume a user-defined operator, e.g. '%in%' or '%%'.
  void consumeUserOperator(Token* pToken)
  {
    consumeUntil<false, true>('%', tokens::OPERATOR_USER, pToken);
  }

  // Consume a '#' comment, terminated by newline or end-of-input.
  void consumeComment(Token* pToken)
  {
    consumeUntil<false, false>('\n', tokens::COMMENT, pToken);
  }

  // Consume a backtick-quoted symbol, e.g. '`a symbol`'.
  void consumeQuotedSymbol(Token* pToken)
  {
    consumeUntil<true, true>('`', tokens::SYMBOL, pToken);
  }

  // Consume a single-quoted string.
  void consumeQString(Token* pToken)
  {
    consumeUntil<true, true>('\'', tokens::STRING, pToken);
  }

  // Consume a double-quoted string.
  void consumeQQString(Token* pToken)
  {
    consumeUntil<true, true>('"', tokens::STRING, pToken);
  }

  // NOTE: Don't tokenize '-' or '+' as part of number; instead
  // it's parsed as a unary operator.
  bool isStartOfNumber()
  {
    char ch = cursor_.peek();
    if (utils::isDigit(ch))
      return true;
    if (ch == '.')
      return utils::isDigit(cursor_.peek(1));
    return false;
  }

  bool isStartOfSymbol()
  {
    return utils::isValidForStartOfRSymbol(cursor_.peek());
  }

  // Attempt to consume a hexadecimal literal (e.g. '0xAB', '0xFFL',
  // '0x1i'). Returns false without consuming anything when the text
  // here does not start with '0x' / '0X'; returns true when a token
  // (NUMBER, or INVALID for malformed input) was produced.
  bool consumeHexadecimalNumber(Token* pToken)
  {
    std::size_t distance = 0;

    // Detect the leading '0'.
    if (cursor_.peek(distance) != '0')
      return false;
    ++distance;

    // Detect a 'x' or 'X'.
    if (!(cursor_.peek(distance) == 'x' || cursor_.peek(distance) == 'X'))
      return false;
    ++distance;

    // Check and consume all alphanumeric characters.
    // The number is valid if the characters are valid
    // hexadecimal characters (0-9, a-f, A-F). The number
    // can also end with an 'i' (for an imaginary number)
    // or with an 'L' for an integer.
    if (!utils::isHexDigit(cursor_.peek(distance)))
    {
      consumeToken(tokens::INVALID, distance, pToken);
      return false;
    }

    bool success = true;
    char peek = cursor_.peek(distance);
    while (utils::isAlphaNumeric(peek) && peek != '\0') {

      // If we encounter an 'i' or an 'L', assume
      // that this ends the identifier.
      if (peek == 'i' || peek == 'L')
      {
        ++distance;
        break;
      }

      if (!utils::isHexDigit(peek))
        success = false;

      ++distance;
      peek = cursor_.peek(distance);
    }

    consumeToken(success ? tokens::NUMBER : tokens::INVALID, distance, pToken);
    return true;
  }

  // Consume a numeric literal: decimal digits, optional fraction,
  // optional exponent, optional trailing 'L' (integer suffix).
  // Malformed forms (e.g. an exponent with no digits) are consumed as
  // a single INVALID token rather than split apart.
  void consumeNumber(Token* pToken)
  {
    bool success = true;
    std::size_t distance = 0;

    // NOTE: A leading '-' or '+' is not consumed as part of
    // the number.

    // Try parsing this as a hexadecimal number first (e.g. '0xabc').
    if (consumeHexadecimalNumber(pToken))
      return;

    // Consume digits
    while (utils::isDigit(cursor_.peek(distance)))
      ++distance;

    // Consume a dot for decimals
    // Note: '.5' is a valid specification for a number
    // So is '100.'; ie, with a trailing decimal.
    if (cursor_.peek(distance) == '.') {
      ++distance;
      while (utils::isDigit(cursor_.peek(distance)))
        ++distance;
    }

    // Consume 'e', 'E' for exponential notation
    if (cursor_.peek(distance) == 'e' || cursor_.peek(distance) == 'E') {
      ++distance;

      // Consume a '-' or a '+' for a negative number
      if (cursor_.peek(distance) == '-' || cursor_.peek(distance) == '+')
        ++distance;

      // Parse another set of numbers following the E
      success = utils::isDigit(cursor_.peek(distance));
      while (utils::isDigit(cursor_.peek(distance)))
        ++distance;

      // Consume '.' and following numbers. Note that this is
      // not really a valid number for R but it's better to tokenize
      // this is a single entity (and then report failure later)
      if (cursor_.peek(distance) == '.') {
        success = false;
        ++distance;
        while (utils::isDigit(cursor_.peek(distance)))
          ++distance;
      }
    }

    // Consume a final 'L' for integer literals
    if (cursor_.peek(distance) == 'L')
      ++distance;

    consumeToken(success ? tokens::NUMBER : tokens::INVALID, distance, pToken);
  }

  // Consume a symbol (identifier), classifying it as a keyword token
  // when it spells one of R's reserved words (see tokens::symbolType).
  void consumeSymbol(Token* pToken)
  {
    std::size_t distance = 1;
    char ch = cursor_.peek(distance);
    while (utils::isValidForRSymbol(ch)) {
      ++distance;
      ch = cursor_.peek(distance);
    }

    const char* ptr = &*(cursor_.begin() + cursor_.offset());
    consumeToken(tokens::symbolType(ptr, distance), distance, pToken);
  }

public:

  // Construct a tokenizer over the buffer [code, code + n).
  Tokenizer(const char* code, std::size_t n)
    : cursor_(code, n)
  {
  }

  // Produce the next token into '*pToken'. Returns false (and sets an
  // END token) once the cursor has reached the end of input. Characters
  // that match nothing are consumed one at a time as INVALID tokens, so
  // the tokenizer always makes progress.
  bool tokenize(Token* pToken)
  {
    if (cursor_ >= cursor_.end())
    {
      *pToken = Token(tokens::END);
      return false;
    }

    char ch = cursor_.peek();
    int n = 0;

    // Block-related tokens
    if (ch == '{')
      consumeToken(tokens::LBRACE, 1, pToken);
    else if (ch == '}')
      consumeToken(tokens::RBRACE, 1, pToken);
    else if (ch == '(')
      consumeToken(tokens::LPAREN, 1, pToken);
    else if (ch == ')')
      consumeToken(tokens::RPAREN, 1, pToken);
    else if (ch == '[') {
      // A stack of open square brackets distinguishes ']' closing a
      // '[' from the first ']' of a ']]' closing a '[['.
      if (cursor_.peek(1) == '[') {
        tokenStack_.push(tokens::LDBRACKET);
        consumeToken(tokens::LDBRACKET, 2, pToken);
      } else {
        tokenStack_.push(tokens::LBRACKET);
        consumeToken(tokens::LBRACKET, 1, pToken);
      }
    } else if (ch == ']') {
      if (tokenStack_.empty()) {
        consumeToken(tokens::INVALID, 1, pToken);
      } else if (tokenStack_.top() == tokens::LDBRACKET) {
        tokenStack_.pop();
        if (cursor_.peek(1) == ']')
          consumeToken(tokens::RDBRACKET, 2, pToken);
        else
          consumeToken(tokens::INVALID, 1, pToken);
      } else {
        tokenStack_.pop();
        consumeToken(tokens::RBRACKET, 1, pToken);
      }
    }

    // Operators
    else if (ch == '<') // <<-, <=, <-, <
    {
      char next = cursor_.peek(1);
      if (next == '-') // <-
        consumeToken(tokens::OPERATOR_ASSIGN_LEFT, 2, pToken);
      else if (next == '=') // <=
        consumeToken(tokens::OPERATOR_LESS_OR_EQUAL, 2, pToken);
      else if (next == '<' && cursor_.peek(2) == '-')
        consumeToken(tokens::OPERATOR_ASSIGN_LEFT_PARENT, 3, pToken);
      else
        consumeToken(tokens::OPERATOR_LESS, 1, pToken);
    }

    else if (ch == '>') // >=, >
    {
      if (cursor_.peek(1) == '=')
        consumeToken(tokens::OPERATOR_GREATER_OR_EQUAL, 2, pToken);
      else
        consumeToken(tokens::OPERATOR_GREATER, 1, pToken);
    }
    else if (ch == '=') // '==', '='
    {
      if (cursor_.peek(1) == '=')
        consumeToken(tokens::OPERATOR_EQUAL, 2, pToken);
      else
        consumeToken(tokens::OPERATOR_ASSIGN_LEFT_EQUALS, 1, pToken);
    }
    else if (ch == '|') // '||', '|'
    {
      if (cursor_.peek(1) == '|')
        consumeToken(tokens::OPERATOR_OR_SCALAR, 2, pToken);
      else
        consumeToken(tokens::OPERATOR_OR_VECTOR, 1, pToken);
    }
    else if (ch == '&') // '&&', '&'
    {
      if (cursor_.peek(1) == '&')
        consumeToken(tokens::OPERATOR_AND_SCALAR, 2, pToken);
      else
        consumeToken(tokens::OPERATOR_AND_VECTOR, 1, pToken);
    }
    else if (ch == '*') // **, *
    {
      if (cursor_.peek(1) == '*')
        consumeToken(tokens::OPERATOR_EXPONENTATION_STARS, 2, pToken);
      else
        consumeToken(tokens::OPERATOR_MULTIPLY, 1, pToken);
    }
    else if (ch == ':') // ':::', '::', ':=', ':'
    {
      if (cursor_.peek(1) == ':')
      {
        if (cursor_.peek(2) == ':')
          consumeToken(tokens::OPERATOR_NAMESPACE_ALL, 3, pToken);
        else
          consumeToken(tokens::OPERATOR_NAMESPACE_EXPORTS, 2, pToken);
      }
      else if (cursor_.peek(1) == '=')
        consumeToken(tokens::OPERATOR_ASSIGN_LEFT_COLON, 2, pToken);
      else
        consumeToken(tokens::OPERATOR_SEQUENCE, 1, pToken);
    }
    else if (ch == '!')
    {
      if (cursor_.peek(1) == '=')
        consumeToken(tokens::OPERATOR_NOT_EQUAL, 2, pToken);
      else
        consumeToken(tokens::OPERATOR_NEGATION, 1, pToken);
    }
    else if (ch == '-') // '->>', '->', '-'
    {
      if (cursor_.peek(1) == '>')
      {
        if (cursor_.peek(2) == '>')
          consumeToken(tokens::OPERATOR_ASSIGN_RIGHT_PARENT, 3, pToken);
        else
          consumeToken(tokens::OPERATOR_ASSIGN_RIGHT, 2, pToken);
      }
      else
        consumeToken(tokens::OPERATOR_MINUS, 1, pToken);
    }
    else if (ch == '+')
      consumeToken(tokens::OPERATOR_PLUS, 1, pToken);
    else if (ch == '~')
      consumeToken(tokens::OPERATOR_FORMULA, 1, pToken);
    else if (ch == '?')
      consumeToken(tokens::OPERATOR_HELP, 1, pToken);
    else if (ch == '/')
      consumeToken(tokens::OPERATOR_DIVIDE, 1, pToken);
    else if (ch == '@')
      consumeToken(tokens::OPERATOR_AT, 1, pToken);
    else if (ch == '$')
      consumeToken(tokens::OPERATOR_DOLLAR, 1, pToken);
    else if (ch == '^')
      consumeToken(tokens::OPERATOR_HAT, 1, pToken);

    // User operators
    else if (ch == '%')
      consumeUserOperator(pToken);

    // Punctuation-related tokens
    else if (ch == ',')
      consumeToken(tokens::COMMA, 1, pToken);
    else if (ch == ';')
      consumeToken(tokens::SEMI, 1, pToken);

    // Whitespace
    else if (utils::countWhitespaceBytes(cursor_, &n))
      consumeToken(tokens::WHITESPACE, n, pToken);

    // Strings and symbols
    else if (ch == '\'')
      consumeQString(pToken);
    else if (ch == '"')
      consumeQQString(pToken);
    else if (ch == '`')
      consumeQuotedSymbol(pToken);

    // Comments
    else if (ch == '#')
      consumeComment(pToken);

    // Number
    else if (isStartOfNumber())
      consumeNumber(pToken);

    // Symbol
    else if (isStartOfSymbol())
      consumeSymbol(pToken);

    // Nothing matched -- error
    else
      consumeToken(tokens::INVALID, 1, pToken);

    return true;
  }

  // Peek ahead 'lookahead' tokens without consuming input (implemented
  // by tokenizing on a copy of this tokenizer's state). Returns an END
  // token when fewer tokens remain.
  Token peek(std::size_t lookahead = 1)
  {
    Tokenizer clone(*this);

    Token result(tokens::END);
    for (std::size_t i = 0; i < lookahead; ++i) {
      if (!clone.tokenize(&result)) {
        break;
      }
    }

    return result;
  }

private:
  TextCursor cursor_;
  // Open '[' / '[[' brackets, used to pair ']' tokens correctly.
  std::stack<TokenType, std::vector<TokenType> > tokenStack_;
};
+
+} // namespace tokenizer
+
+inline std::vector<tokens::Token> tokenize(const char* code, std::size_t n)
+{
+ typedef tokenizer::Tokenizer Tokenizer;
+ typedef tokens::Token Token;
+
+ std::vector<Token> tokens;
+ if (n == 0)
+ return tokens;
+
+ Token token;
+ Tokenizer tokenizer(code, n);
+ while (tokenizer.tokenize(&token))
+ tokens.push_back(token);
+
+ return tokens;
+}
+
+inline std::vector<tokens::Token> tokenize(const std::string& code)
+{
+ return tokenize(code.data(), code.size());
+}
+
+} // namespace sourcetools
+
+#endif /* SOURCETOOLS_TOKENIZATION_TOKENIZER_H */
diff --git a/inst/include/sourcetools/tokenization/tokenization.h b/inst/include/sourcetools/tokenization/tokenization.h
new file mode 100644
index 0000000..e1dee85
--- /dev/null
+++ b/inst/include/sourcetools/tokenization/tokenization.h
@@ -0,0 +1,8 @@
+#ifndef SOURCETOOLS_TOKENIZATION_TOKENIZATION_H
+#define SOURCETOOLS_TOKENIZATION_TOKENIZATION_H
+
+#include <sourcetools/tokenization/Registration.h>
+#include <sourcetools/tokenization/Token.h>
+#include <sourcetools/tokenization/Tokenizer.h>
+
+#endif /* SOURCETOOLS_TOKENIZATION_TOKENIZATION_H */
diff --git a/inst/include/sourcetools/utf8/utf8.h b/inst/include/sourcetools/utf8/utf8.h
new file mode 100644
index 0000000..96e0c5e
--- /dev/null
+++ b/inst/include/sourcetools/utf8/utf8.h
@@ -0,0 +1,115 @@
+#ifndef SOURCETOOLS_UTF8_UTF8_H
+#define SOURCETOOLS_UTF8_UTF8_H
+
+#include <cstddef>
+
+#include <sourcetools/core/core.h>
+
+namespace sourcetools {
+namespace utf8 {
+
+namespace detail {
+// Bit masks used to extract the payload bits from a UTF-8 lead byte,
+// indexed by the total sequence length in bytes as reported by
+// iterator::size(). Index 0 is unused; index 1 covers plain ASCII.
+static const unsigned char mask[] = {
+ 0, // 00000000
+ 0x7F, // 01111111
+ 0x1F, // 00011111
+ 0x0F, // 00001111
+ 0x07, // 00000111
+ 0x03, // 00000011
+ 0x01 // 00000001
+};
+} // namespace detail
+
+// Minimal forward iterator over UTF-8 encoded text. Dereferencing
+// decodes the (possibly multi-byte) sequence at the current position
+// into a single wide character; incrementing advances by the byte
+// length of that sequence. The iterator does not own its buffer.
+class iterator
+{
+public:
+ // Wrap a NUL-terminated (or externally bounded) byte buffer.
+ iterator(const char* data)
+ : data_(reinterpret_cast<const unsigned char*>(data)),
+ offset_(0)
+ {
+ }
+
+ iterator(const iterator& other)
+ : data_(other.data_),
+ offset_(other.offset_)
+ {
+ }
+
+ // Decode the UTF-8 sequence at the current offset. Returns
+ // wchar_t(-1) when no valid length is available (n == 0 happens at
+ // a NUL byte; n > 6 is unreachable since size() caps at 6).
+ // NOTE(review): on platforms where wchar_t is 16 bits (Windows),
+ // code points outside the BMP would be truncated -- confirm intent.
+ wchar_t operator*()
+ {
+ std::size_t n = size();
+ if (n == 0 || n > 6)
+ return -1;
+
+ // Mask the payload bits out of the lead byte, then fold in six
+ // payload bits from each continuation byte.
+ const unsigned char* it = data_ + offset_;
+ wchar_t ch = (*it++) & detail::mask[n];
+ for (std::size_t i = 1; i < n; ++i)
+ {
+ ch <<= 6;
+ ch |= (*it++) & 0x3F;
+ }
+
+ return ch;
+ }
+
+ // Advance past the current character (by its full byte length).
+ iterator& operator++()
+ {
+ offset_ += size();
+ return *this;
+ }
+
+ iterator operator++(int)
+ {
+ iterator copy(*this);
+ operator++();
+ return copy;
+ }
+
+ // Iterators compare equal when they reference the same absolute
+ // byte address (base pointer plus offset), so iterators built from
+ // different pointers into the same buffer compare correctly.
+ bool operator==(const iterator& it)
+ {
+ return
+ data_ + offset_ ==
+ it.data_ + it.offset_;
+ }
+
+ bool operator!=(const iterator& it)
+ {
+ return
+ data_ + offset_ !=
+ it.data_ + it.offset_;
+ }
+
+private:
+
+ // Byte length of the UTF-8 sequence starting at the current offset,
+ // classified from the lead byte; 0 at a NUL terminator.
+ // NOTE(review): continuation bytes (0x80-0xBF) fall into the
+ // '< 192' branch and report length 1, so malformed input is stepped
+ // over byte-by-byte rather than rejected -- confirm this is wanted.
+ int size()
+ {
+ unsigned char ch = data_[offset_];
+ if (ch == 0)
+ return 0;
+ else if (ch < 192)
+ return 1;
+ else if (ch < 224)
+ return 2;
+ else if (ch < 240)
+ return 3;
+ else if (ch < 248)
+ return 4;
+ else if (ch < 252)
+ return 5;
+ else if (ch < 254)
+ return 6;
+
+ // TODO: on error?
+ return 1;
+ }
+
+private:
+
+ const unsigned char* data_; // borrowed pointer to the text buffer
+ std::size_t offset_; // current byte position within data_
+};
+
+} // namespace utf8
+} // namespace sourcetools
+
+#endif /* SOURCETOOLS_UTF8_UTF8_H */
diff --git a/man/read.Rd b/man/read.Rd
new file mode 100644
index 0000000..a3223f3
--- /dev/null
+++ b/man/read.Rd
@@ -0,0 +1,25 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/sourcetools.R
+\name{read}
+\alias{read}
+\alias{read_bytes}
+\alias{read_lines}
+\alias{read_lines_bytes}
+\title{Read the Contents of a File}
+\usage{
+read(path)
+
+read_lines(path)
+
+read_bytes(path)
+
+read_lines_bytes(path)
+}
+\arguments{
+\item{path}{A file path.}
+}
+\description{
+Read the contents of a file into a string (or, in the case of
+\code{read_lines}, a character vector with one element per line).
+The \code{read_bytes} and \code{read_lines_bytes} variants return
+the same contents as raw bytes.
+}
+
diff --git a/man/tokenize-methods.Rd b/man/tokenize-methods.Rd
new file mode 100644
index 0000000..4f1da94
--- /dev/null
+++ b/man/tokenize-methods.Rd
@@ -0,0 +1,42 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/sourcetools.R
+\name{tokenize_file}
+\alias{tokenize}
+\alias{tokenize_file}
+\alias{tokenize_string}
+\title{Tokenize R Code}
+\usage{
+tokenize_file(path)
+
+tokenize_string(string)
+
+tokenize(file = "", text = NULL)
+}
+\arguments{
+\item{file, path}{A file path.}
+
+\item{text, string}{\R code as a character vector of length one.}
+}
+\value{
+A \code{data.frame} with the following columns:
+
+\tabular{ll}{
+\code{value} \tab The token's contents, as a string. \cr
+\code{row} \tab The row where the token is located. \cr
+\code{column} \tab The column where the token is located. \cr
+\code{type} \tab The token type, as a string. \cr
+}
+}
+\description{
+Tools for tokenizing \R code.
+}
+\note{
+Line numbers are determined by the presence of the \code{\\n}
+line feed character, under the assumption that code being tokenized
+will use either \code{\\n} to indicate newlines (as on modern
+Unix systems), or \code{\\r\\n} as on Windows.
+}
+\examples{
+tokenize_string("x <- 1 + 2")
+}
+
diff --git a/src/Makevars b/src/Makevars
new file mode 100644
index 0000000..4340efb
--- /dev/null
+++ b/src/Makevars
@@ -0,0 +1 @@
+PKG_CPPFLAGS = -I../inst/include
diff --git a/src/Makevars.win b/src/Makevars.win
new file mode 100644
index 0000000..4340efb
--- /dev/null
+++ b/src/Makevars.win
@@ -0,0 +1 @@
+PKG_CPPFLAGS = -I../inst/include
diff --git a/src/Reader.cpp b/src/Reader.cpp
new file mode 100644
index 0000000..331ba0c
--- /dev/null
+++ b/src/Reader.cpp
@@ -0,0 +1,88 @@
+#include <cstring>
+
+#include <sourcetools/read/read.h>
+#include <sourcetools/r/r.h>
+
+#define R_NO_REMAP
+#include <R.h>
+#include <Rinternals.h>
+
+// R entry point: read the file named by 'absolutePathSEXP' (a
+// length-one character vector) and return its contents as a
+// length-one character vector, or NULL (with a warning) on failure.
+extern "C" SEXP sourcetools_read(SEXP absolutePathSEXP)
+{
+  const char* path = CHAR(STRING_ELT(absolutePathSEXP, 0));
+
+  std::string contents;
+  if (!sourcetools::read(path, &contents))
+  {
+    Rf_warning("Failed to read file");
+    return R_NilValue;
+  }
+
+  sourcetools::r::Protect protect;
+  SEXP outputSEXP = protect(Rf_allocVector(STRSXP, 1));
+  SET_STRING_ELT(outputSEXP, 0, Rf_mkCharLen(contents.c_str(), contents.size()));
+  return outputSEXP;
+}
+
+// R entry point: read the file line-by-line and return a character
+// vector with one element per line, or NULL (with a warning) when
+// the file cannot be read.
+extern "C" SEXP sourcetools_read_lines(SEXP absolutePathSEXP)
+{
+  const char* path = CHAR(STRING_ELT(absolutePathSEXP, 0));
+
+  std::vector<std::string> lines;
+  if (!sourcetools::read_lines(path, &lines))
+  {
+    Rf_warning("Failed to read file");
+    return R_NilValue;
+  }
+
+  sourcetools::r::Protect protect;
+  std::size_t count = lines.size();
+  SEXP outputSEXP = protect(Rf_allocVector(STRSXP, count));
+  for (std::size_t i = 0; i < count; ++i)
+  {
+    const std::string& line = lines[i];
+    SET_STRING_ELT(outputSEXP, i, Rf_mkCharLen(line.c_str(), line.size()));
+  }
+  return outputSEXP;
+}
+
+// R entry point: read a file and return its full contents as a raw
+// vector of bytes. Returns NULL, with a warning, on read failure.
+extern "C" SEXP sourcetools_read_bytes(SEXP absolutePathSEXP)
+{
+ const char* absolutePath = CHAR(STRING_ELT(absolutePathSEXP, 0));
+
+ std::string contents;
+ bool result = sourcetools::read(absolutePath, &contents);
+ if (!result)
+ {
+ Rf_warning("Failed to read file");
+ return R_NilValue;
+ }
+
+ // Copy the bytes verbatim into an R raw vector. Protect appears to
+ // be an RAII PROTECT guard (see r/RUtils.h) -- verify it unprotects
+ // on scope exit.
+ sourcetools::r::Protect protect;
+ SEXP resultSEXP = protect(Rf_allocVector(RAWSXP, contents.size()));
+ std::memcpy(RAW(resultSEXP), contents.c_str(), contents.size());
+ return resultSEXP;
+}
+
+// R entry point: read a file and return its lines as a list of raw
+// vectors (one raw vector of bytes per line). Returns NULL, with a
+// warning, when the file cannot be read.
+extern "C" SEXP sourcetools_read_lines_bytes(SEXP absolutePathSEXP)
+{
+ const char* absolutePath = CHAR(STRING_ELT(absolutePathSEXP, 0));
+
+ std::vector<std::string> lines;
+ bool result = sourcetools::read_lines(absolutePath, &lines);
+ if (!result)
+ {
+ Rf_warning("Failed to read file");
+ return R_NilValue;
+ }
+
+ std::size_t n = lines.size();
+ sourcetools::r::Protect protect;
+ SEXP resultSEXP = protect(Rf_allocVector(VECSXP, n));
+ for (std::size_t i = 0; i < n; ++i)
+ {
+ // rawSEXP is not explicitly protected: nothing allocates between
+ // its creation and SET_VECTOR_ELT below, which roots it inside
+ // the already-protected resultSEXP (memcpy does not allocate).
+ SEXP rawSEXP = Rf_allocVector(RAWSXP, lines[i].size());
+ std::memcpy(RAW(rawSEXP), lines[i].c_str(), lines[i].size());
+ SET_VECTOR_ELT(resultSEXP, i, rawSEXP);
+ }
+ return resultSEXP;
+}
diff --git a/src/Tokenizer.cpp b/src/Tokenizer.cpp
new file mode 100644
index 0000000..1fe8bb1
--- /dev/null
+++ b/src/Tokenizer.cpp
@@ -0,0 +1,96 @@
+#include <sourcetools.h>
+
+#define R_NO_REMAP
+#include <R.h>
+#include <Rinternals.h>
+
+namespace sourcetools {
+namespace {
+
+// Stamp the "data.frame" class and compact row names onto 'listSEXP',
+// a list whose columns each hold 'n' elements. The c(NA, -n) integer
+// pair is the encoding R recognizes as row names 1:n without
+// materializing the full sequence.
+void asDataFrame(SEXP listSEXP, int n)
+{
+  r::Protect protect;
+
+  Rf_setAttrib(listSEXP, R_ClassSymbol, protect(Rf_mkString("data.frame")));
+
+  SEXP rowNamesSEXP = protect(Rf_allocVector(INTSXP, 2));
+  int* rowNames = INTEGER(rowNamesSEXP);
+  rowNames[0] = NA_INTEGER;
+  rowNames[1] = -n;
+  Rf_setAttrib(listSEXP, R_RowNamesSymbol, rowNamesSEXP);
+}
+
+// Convert a token stream into a four-column R list (value, row,
+// column, type) and stamp it as a data.frame. Row and column indices
+// are converted from 0-based (C++) to 1-based (R) by the '+ 1'.
+// Every freshly allocated vector is protected before any further
+// allocation, per R's GC contract.
+SEXP asSEXP(const std::vector<tokens::Token>& tokens)
+{
+ r::Protect protect;
+ std::size_t n = tokens.size();
+ SEXP resultSEXP = protect(Rf_allocVector(VECSXP, 4));
+
+ // Set vector elements
+ // Column 0: the token text itself.
+ SEXP valueSEXP = protect(Rf_allocVector(STRSXP, n));
+ SET_VECTOR_ELT(resultSEXP, 0, valueSEXP);
+ for (std::size_t i = 0; i < n; ++i) {
+ const std::string& contents = tokens[i].contents();
+ SEXP charSEXP = Rf_mkCharLen(contents.c_str(), contents.size());
+ SET_STRING_ELT(valueSEXP, i, charSEXP);
+ }
+
+ // Column 1: 1-based row numbers.
+ SEXP rowSEXP = protect(Rf_allocVector(INTSXP, n));
+ SET_VECTOR_ELT(resultSEXP, 1, rowSEXP);
+ for (std::size_t i = 0; i < n; ++i)
+ INTEGER(rowSEXP)[i] = tokens[i].row() + 1;
+
+ // Column 2: 1-based column numbers.
+ SEXP columnSEXP = protect(Rf_allocVector(INTSXP, n));
+ SET_VECTOR_ELT(resultSEXP, 2, columnSEXP);
+ for (std::size_t i = 0; i < n; ++i)
+ INTEGER(columnSEXP)[i] = tokens[i].column() + 1;
+
+ // Column 3: the token type rendered as a string.
+ SEXP typeSEXP = protect(Rf_allocVector(STRSXP, n));
+ SET_VECTOR_ELT(resultSEXP, 3, typeSEXP);
+ for (std::size_t i = 0; i < n; ++i) {
+ const std::string& type = toString(tokens[i].type());
+ SEXP charSEXP = Rf_mkCharLen(type.c_str(), type.size());
+ SET_STRING_ELT(typeSEXP, i, charSEXP);
+ }
+
+ // Set names
+ SEXP namesSEXP = protect(Rf_allocVector(STRSXP, 4));
+
+ SET_STRING_ELT(namesSEXP, 0, Rf_mkChar("value"));
+ SET_STRING_ELT(namesSEXP, 1, Rf_mkChar("row"));
+ SET_STRING_ELT(namesSEXP, 2, Rf_mkChar("column"));
+ SET_STRING_ELT(namesSEXP, 3, Rf_mkChar("type"));
+
+ Rf_setAttrib(resultSEXP, R_NamesSymbol, namesSEXP);
+
+ asDataFrame(resultSEXP, n);
+
+ return resultSEXP;
+}
+
+} // anonymous namespace
+} // namespace sourcetools
+
+// R entry point: tokenize the file at the given absolute path and
+// return the tokens as a data.frame (value, row, column, type).
+// Returns NULL (with a warning) when the file cannot be read.
+extern "C" SEXP sourcetools_tokenize_file(SEXP absolutePathSEXP)
+{
+  const char* path = CHAR(STRING_ELT(absolutePathSEXP, 0));
+
+  std::string contents;
+  if (!sourcetools::read(path, &contents))
+  {
+    Rf_warning("Failed to read file");
+    return R_NilValue;
+  }
+
+  return sourcetools::asSEXP(sourcetools::tokenize(contents));
+}
+
+// R entry point: tokenize a length-one character vector of R code
+// and return the token data.frame.
+extern "C" SEXP sourcetools_tokenize_string(SEXP stringSEXP)
+{
+  SEXP charSEXP = STRING_ELT(stringSEXP, 0);
+  const char* code = CHAR(charSEXP);
+  return sourcetools::asSEXP(sourcetools::tokenize(code, Rf_length(charSEXP)));
+}
diff --git a/tests/testthat.R b/tests/testthat.R
new file mode 100644
index 0000000..c610b60
--- /dev/null
+++ b/tests/testthat.R
@@ -0,0 +1,4 @@
+# Test-suite entry point. require() is intentional here: it returns
+# FALSE (rather than erroring) when testthat is unavailable, so the
+# suite is silently skipped on systems without it.
+if (require("testthat", quietly = TRUE)) {
+ library(sourcetools)
+ test_check("sourcetools")
+}
diff --git a/tests/testthat/helper-utf8.R b/tests/testthat/helper-utf8.R
new file mode 100644
index 0000000..6da2209
--- /dev/null
+++ b/tests/testthat/helper-utf8.R
@@ -0,0 +1,3 @@
+# Fixture strings shared across tests, each writing non-ASCII or
+# escaped content a different way.
+octal <- "\012" # line feed, written as an octal escape
+hex <- "\xE2\x99\xA5" # the raw UTF-8 bytes of U+2665 (heart)
+utf8 <- "\u2665" # U+2665 written as a Unicode escape
diff --git a/tests/testthat/test-read.R b/tests/testthat/test-read.R
new file mode 100644
index 0000000..6c37e20
--- /dev/null
+++ b/tests/testthat/test-read.R
@@ -0,0 +1,30 @@
+context("Reader")
+
+# Exercise the package readers against every file in the working
+# directory, using base R's readers as the reference implementation.
+files <- list.files()
+
+test_that("read_lines and readLines agree on output", {
+  for (path in files) {
+    expected <- readLines(path)
+    expect_identical(expected, sourcetools::read_lines(path))
+  }
+})
+
+test_that("read and readChar agree on output", {
+  for (path in files) {
+    expected <- readChar(path, file.info(path)$size, TRUE)
+    expect_identical(expected, sourcetools::read(path))
+  }
+})
+
+test_that("read_bytes and readBin agree on output", {
+  for (path in files) {
+    expected <- readBin(path, "raw", file.info(path)$size)
+    expect_identical(expected, sourcetools::read_bytes(path))
+  }
+})
diff --git a/tests/testthat/test-tokenize.R b/tests/testthat/test-tokenize.R
new file mode 100644
index 0000000..ba0eee2
--- /dev/null
+++ b/tests/testthat/test-tokenize.R
@@ -0,0 +1,165 @@
+context("Tokenizer")
+
+# Compare a token stream against a vector of expected token values.
+# 'tokens' may be either a pre-tokenized data.frame or a string of
+# R code (which is tokenized first). Reports per-token mismatches.
+compare_tokens <- function(tokens, expected) {
+
+  if (is.character(tokens))
+    tokens <- tokenize_string(tokens)
+
+  expect_true(
+    nrow(tokens) == length(expected),
+    "different number of tokens"
+  )
+
+  # seq_len() (rather than 1:nrow) is safe when there are no tokens.
+  for (i in seq_len(nrow(tokens))) {
+    expect_true(
+      tokens$value[[i]] == expected[[i]],
+      # Report the expected value first and the actual value second;
+      # the previous message had the two swapped.
+      paste0("expected token '", expected[[i]], "'; got '", tokens$value[[i]], "'")
+    )
+  }
+
+}
+
+test_that("Operators are tokenized correctly", {
+
+  operators <- c(
+    "::", ":::", "$", "@", "[", "[[", "^", "-", "+", ":",
+    "*", "/", "+", "-", "<", ">", "<=", ">=", "==", "!=",
+    "!", "&", "&&", "|", "||", "~", "->", "->>", "<-", "<<-",
+    "=", "?", "**", "%%", "%for%"
+  )
+
+  # Each operator should tokenize as exactly one token. (A previous
+  # version also tokenized the space-joined operators into a variable
+  # that was never inspected; that dead code has been removed.)
+  for (operator in operators) {
+    tokens <- tokenize_string(operator)
+    expect_true(nrow(tokens) == 1, paste0("expected a single token ('", operator, "')"))
+  }
+})
+
+test_that("Numbers are tokenized correctly", {
+
+  numbers <- c("1", "1.0", "0.1", ".1", "0.1E1", "1L", "1.0L", "1.5L",
+               "1E1", "1E-1", "1E-1L", ".100E-105L", "0.", "100.",
+               "1e+09", "1e+90", "1e-90", "1e-00000000000000009")
+
+  # Each numeric literal should produce exactly one token, and that
+  # token should be classified as a number. paste0() replaces the
+  # equivalent paste(..., sep = "") idiom.
+  for (number in numbers) {
+    tokens <- tokenize_string(number)
+    expect_true(nrow(tokens) == 1, paste0("expected a single token ('", number, "')"))
+    token <- as.list(tokens[1, ])
+    expect_true(token$type == "number", paste0("expected a number ('", token$type, "')"))
+  }
+
+})
+
+test_that("The tokenizer accepts UTF-8 symbols", {
+ # A multi-byte UTF-8 identifier should come back as a single token.
+ expect_true(nrow(tokenize_string("鬼")) == 1)
+})
+
+test_that("The tokenizer works correctly", {
+
+ # Whitespace and newlines are tokens in their own right, so the
+ # expected vectors below spell them out explicitly.
+
+ # TODO: Should newlines be absorbed as part of the comment string?
+ tokens <- tokenize_string("# A Comment\n")
+ expected <- "# A Comment\n"
+ compare_tokens(tokens, expected)
+
+ tokens <- tokenize_string("a <- 1 + 2\n")
+ compare_tokens(
+ tokens,
+ c("a", " ", "<-", " ", "1", " ", "+", " ", "2", "\n")
+ )
+
+ compare_tokens(
+ tokenize_string("a<-1"),
+ c("a", "<-", "1")
+ )
+
+ # NOTE: '-' sign tokenized separately from number
+ compare_tokens(
+ tokenize_string("a< -1"),
+ c("a", "<", " ", "-", "1")
+ )
+
+ # Single-token edge cases: literals, dotted symbols, ':=' and '**'.
+ compare_tokens("1.0E5L", "1.0E5L")
+ compare_tokens(".1", ".1")
+ compare_tokens("'\\''", "'\\''")
+ compare_tokens(".a", ".a")
+ compare_tokens("...", "...")
+ compare_tokens(":=", ":=")
+ compare_tokens("x ** 2", c("x", " ", "**", " ", "2"))
+
+})
+
+test_that("`[[` and `[` are tokenized correctly", {
+
+ # '[[' must be preferred over two '[' tokens, including when nested.
+ compare_tokens("x[[1]]", c("x", "[[", "1", "]]"))
+
+ # not really valid R code, but the tokenizer should still
+ # get it right
+ compare_tokens("[[[]]]", c("[[", "[", "]", "]]"))
+
+ compare_tokens(
+ "x[[a[b[[c[1]]]]]]",
+ c("x", "[[", "a", "[", "b", "[[", "c", "[", "1",
+ "]", "]]", "]", "]]")
+ )
+
+})
+
+# Malformed numeric literals should surface as 'invalid' tokens
+# rather than crashing the tokenizer.
+test_that("Failures during number tokenization is detected", {
+ tokens <- tokenize_string("1.5E---")
+ expect_true(tokens$type[[1]] == "invalid")
+})
+
+test_that("invalid number e.g. 1E1.5 tokenized as single entity", {
+ tokens <- tokenize_string("1E1.5")
+ expect_true(nrow(tokens) == 1)
+ expect_true(tokens$type[[1]] == "invalid")
+})
+
+test_that("keywords are tokenized as keywords", {
+
+  keywords <- c("if", "else", "repeat", "while", "function",
+                "for", "in", "next", "break",
+                "TRUE", "FALSE", "NULL", "Inf", "NaN", "NA",
+                "NA_integer_", "NA_real_", "NA_complex_", "NA_character_")
+
+  # Tokenize each keyword and collect the type of its first token;
+  # vapply guarantees a character result of the right length.
+  types <- vapply(keywords, function(keyword) {
+    tokenize_string(keyword)[1, ]$type
+  }, character(1))
+
+  expect_true(all(types == "keyword"))
+})
+
+test_that("comments without a trailing newline are tokenized", {
+ # A comment at end-of-input (no '\n') must still yield one token.
+ tokens <- tokenize_string("# abc")
+ expect_identical(tokens$type, "comment")
+})
+
+test_that("tokenization errors handled correctly", {
+ # Regression test: completing these calls without error is the
+ # entire assertion -- no expectations beyond "does not crash".
+ # previously, these reported an error where a NUL
+ # byte was accidentally included as part of the
+ # token value
+ tokenize_string("`abc")
+ tokenize_string("'abc")
+ tokenize_string("\"abc")
+ tokenize_string("%abc")
+})
+
+test_that("files in packages are tokenized without errors", {
+ skip_on_cran()
+
+ # NOTE(review): developer-machine smoke test -- it assumes local
+ # package checkouts live under ~/git. On machines without that
+ # layout, 'packages' is empty and the loop body never runs.
+ paths <- list.dirs("~/git", full.names = TRUE, recursive = FALSE)
+ packages <- paths[file.exists(file.path(paths, "DESCRIPTION"))]
+ R <- file.path(packages, "R")
+
+ for (dir in R) {
+ files <- list.files(dir, pattern = "R$", full.names = TRUE)
+ for (file in files) {
+ tokens <- tokenize_file(file)
+ # No token in a well-formed source file should be 'invalid'.
+ errors <- tokens$type == "invalid"
+ expect_true(all(errors == FALSE))
+ }
+ }
+
+})
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/r-cran-sourcetools.git
More information about the debian-med-commit
mailing list